mirror of https://github.com/apache/lucene.git
LUCENE-2503: add light stemmers for european languages
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@964019 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d49603b939
commit
3241eb9291
|
@ -187,6 +187,9 @@ New features
|
||||||
* LUCENE-2464: FastVectorHighlighter: add SingleFragListBuilder to return
|
* LUCENE-2464: FastVectorHighlighter: add SingleFragListBuilder to return
|
||||||
entire field contents. (Koji Sekiguchi)
|
entire field contents. (Koji Sekiguchi)
|
||||||
|
|
||||||
|
* LUCENE-2503: Added lighter stemming alternatives for European languages.
|
||||||
|
(Robert Muir)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||||
|
|
|
@ -17,30 +17,29 @@ were developed by Martin Porter and Richard Boulton.
|
||||||
The full snowball package is available from
|
The full snowball package is available from
|
||||||
http://snowball.tartarus.org/
|
http://snowball.tartarus.org/
|
||||||
|
|
||||||
The Arabic stemmer (common) comes with a default
|
The Arabic, Persian, Romanian, Bulgarian, and Hindi analyzers (common) come with a default
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
stopword list that is BSD-licensed and was created by Jacques Savoy. These files reside in:
|
||||||
common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt.
|
common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||||
|
common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||||
|
common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||||
|
common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||||
|
common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||||
|
|
||||||
The Persian analyzer (common) comes with a default
|
The German, Spanish, Finnish, French, Hungarian, Italian, Portuguese, Russian, and Swedish light stemmers
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
(common) are based on BSD-licensed reference implementations created by Jacques Savoy and
|
||||||
common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt.
|
Ljiljana Dolamic. These files reside in:
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
|
||||||
|
common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
|
||||||
The Romanian analyzer (common) comes with a default
|
common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
|
||||||
common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt.
|
common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
|
||||||
|
common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
|
||||||
The Bulgarian analyzer (common) comes with a default
|
common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
|
||||||
common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt.
|
common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
|
||||||
|
|
||||||
The Hindi analyzer (common) comes with a default
|
|
||||||
stopword list that is BSD-licensed created by Jacques Savoy. The file resides in
|
|
||||||
common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt.
|
|
||||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
|
||||||
|
|
||||||
The Stempel analyzer (stempel) includes BSD-licensed software developed
|
The Stempel analyzer (stempel) includes BSD-licensed software developed
|
||||||
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
|
by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.ar;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalizer for Arabic.
|
* Normalizer for Arabic.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -96,20 +98,4 @@ public class ArabicNormalizer {
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Delete a character in-place
|
|
||||||
*
|
|
||||||
* @param s Input Buffer
|
|
||||||
* @param pos Position of character to delete
|
|
||||||
* @param len length of input buffer
|
|
||||||
* @return length of input buffer after deletion
|
|
||||||
*/
|
|
||||||
protected int delete(char s[], int pos, int len) {
|
|
||||||
if (pos < len)
|
|
||||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
|
||||||
|
|
||||||
return len - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
package org.apache.lucene.analysis.ar;
|
package org.apache.lucene.analysis.ar;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -16,6 +18,8 @@ package org.apache.lucene.analysis.ar;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stemmer for Arabic.
|
* Stemmer for Arabic.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -86,7 +90,7 @@ public class ArabicStemmer {
|
||||||
*/
|
*/
|
||||||
public int stemPrefix(char s[], int len) {
|
public int stemPrefix(char s[], int len) {
|
||||||
for (int i = 0; i < prefixes.length; i++)
|
for (int i = 0; i < prefixes.length; i++)
|
||||||
if (startsWith(s, len, prefixes[i]))
|
if (startsWithCheckLength(s, len, prefixes[i]))
|
||||||
return deleteN(s, 0, len, prefixes[i].length);
|
return deleteN(s, 0, len, prefixes[i].length);
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
@ -99,7 +103,7 @@ public class ArabicStemmer {
|
||||||
*/
|
*/
|
||||||
public int stemSuffix(char s[], int len) {
|
public int stemSuffix(char s[], int len) {
|
||||||
for (int i = 0; i < suffixes.length; i++)
|
for (int i = 0; i < suffixes.length; i++)
|
||||||
if (endsWith(s, len, suffixes[i]))
|
if (endsWithCheckLength(s, len, suffixes[i]))
|
||||||
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
|
len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length);
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
@ -111,7 +115,7 @@ public class ArabicStemmer {
|
||||||
* @param prefix prefix to check
|
* @param prefix prefix to check
|
||||||
* @return true if the prefix matches and can be stemmed
|
* @return true if the prefix matches and can be stemmed
|
||||||
*/
|
*/
|
||||||
boolean startsWith(char s[], int len, char prefix[]) {
|
boolean startsWithCheckLength(char s[], int len, char prefix[]) {
|
||||||
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
|
if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters
|
||||||
return false;
|
return false;
|
||||||
} else if (len < prefix.length + 2) { // other prefixes require only 2.
|
} else if (len < prefix.length + 2) { // other prefixes require only 2.
|
||||||
|
@ -132,7 +136,7 @@ public class ArabicStemmer {
|
||||||
* @param suffix suffix to check
|
* @param suffix suffix to check
|
||||||
* @return true if the suffix matches and can be stemmed
|
* @return true if the suffix matches and can be stemmed
|
||||||
*/
|
*/
|
||||||
boolean endsWith(char s[], int len, char suffix[]) {
|
boolean endsWithCheckLength(char s[], int len, char suffix[]) {
|
||||||
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
|
if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming
|
||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
|
@ -142,37 +146,5 @@ public class ArabicStemmer {
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Delete n characters in-place
|
|
||||||
*
|
|
||||||
* @param s Input Buffer
|
|
||||||
* @param pos Position of character to delete
|
|
||||||
* @param len Length of input buffer
|
|
||||||
* @param nChars number of characters to delete
|
|
||||||
* @return length of input buffer after deletion
|
|
||||||
*/
|
|
||||||
protected int deleteN(char s[], int pos, int len, int nChars) {
|
|
||||||
for (int i = 0; i < nChars; i++)
|
|
||||||
len = delete(s, pos, len);
|
|
||||||
return len;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Delete a character in-place
|
|
||||||
*
|
|
||||||
* @param s Input Buffer
|
|
||||||
* @param pos Position of character to delete
|
|
||||||
* @param len length of input buffer
|
|
||||||
* @return length of input buffer after deletion
|
|
||||||
*/
|
|
||||||
protected int delete(char s[], int pos, int len) {
|
|
||||||
if (pos < len)
|
|
||||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
|
||||||
|
|
||||||
return len - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.bg;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Light Stemmer for Bulgarian.
|
* Light Stemmer for Bulgarian.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -138,15 +140,4 @@ public class BulgarianStemmer {
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean endsWith(final char s[], final int len, final String suffix) {
|
|
||||||
final int suffixLen = suffix.length();
|
|
||||||
if (suffixLen > len)
|
|
||||||
return false;
|
|
||||||
for (int i = suffixLen - 1; i >= 0; i--)
|
|
||||||
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.cz;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Light Stemmer for Czech.
|
* Light Stemmer for Czech.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -166,16 +168,4 @@ public class CzechStemmer {
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean endsWith(char s[], int len, String suffix) {
|
|
||||||
int suffixLen = suffix.length();
|
|
||||||
if (suffixLen > len)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
for (int i = suffixLen - 1; i >= 0; i--)
|
|
||||||
if (s[len - (suffixLen - i)] != suffix.charAt(i))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link GermanLightStemmer} to stem German
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class GermanLightStemFilter extends TokenFilter {
|
||||||
|
private final GermanLightStemmer stemmer = new GermanLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public GermanLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,138 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for German.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the "UniNE" algorithm in:
|
||||||
|
* <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
|
||||||
|
* Jacques Savoy
|
||||||
|
*/
|
||||||
|
public class GermanLightStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
switch(s[i]) {
|
||||||
|
case 'ä':
|
||||||
|
case 'à':
|
||||||
|
case 'á':
|
||||||
|
case 'â': s[i] = 'a'; break;
|
||||||
|
case 'ö':
|
||||||
|
case 'ò':
|
||||||
|
case 'ó':
|
||||||
|
case 'ô': s[i] = 'o'; break;
|
||||||
|
case 'ï':
|
||||||
|
case 'ì':
|
||||||
|
case 'í':
|
||||||
|
case 'î': s[i] = 'i'; break;
|
||||||
|
case 'ü':
|
||||||
|
case 'ù':
|
||||||
|
case 'ú':
|
||||||
|
case 'û': s[i] = 'u'; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
len = step1(s, len);
|
||||||
|
return step2(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean stEnding(char ch) {
|
||||||
|
switch(ch) {
|
||||||
|
case 'b':
|
||||||
|
case 'd':
|
||||||
|
case 'f':
|
||||||
|
case 'g':
|
||||||
|
case 'h':
|
||||||
|
case 'k':
|
||||||
|
case 'l':
|
||||||
|
case 'm':
|
||||||
|
case 'n':
|
||||||
|
case 't': return true;
|
||||||
|
default: return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int step1(char s[], int len) {
|
||||||
|
if (len > 5 && s[len-3] == 'e' && s[len-2] == 'r' && s[len-1] == 'n')
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (len > 4 && s[len-2] == 'e')
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'm':
|
||||||
|
case 'n':
|
||||||
|
case 'r':
|
||||||
|
case 's': return len - 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 3 && s[len-1] == 'e')
|
||||||
|
return len - 1;
|
||||||
|
|
||||||
|
if (len > 3 && s[len-1] == 's' && stEnding(s[len-2]))
|
||||||
|
return len - 1;
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int step2(char s[], int len) {
|
||||||
|
if (len > 5 && s[len-3] == 'e' && s[len-2] == 's' && s[len-1] == 't')
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (len > 4 && s[len-2] == 'e' && (s[len-1] == 'r' || s[len-1] == 'n'))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if (len > 4 && s[len-2] == 's' && s[len-1] == 't' && stEnding(s[len-3]))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link GermanMinimalStemmer} to stem German
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class GermanMinimalStemFilter extends TokenFilter {
|
||||||
|
private final GermanMinimalStemmer stemmer = new GermanMinimalStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public GermanMinimalStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,95 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimal Stemmer for German.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the following algorithm:
|
||||||
|
* <i>Morphologie et recherche d'information</i>
|
||||||
|
* Jacques Savoy.
|
||||||
|
*/
|
||||||
|
public class GermanMinimalStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len < 5)
|
||||||
|
return len;
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
switch(s[i]) {
|
||||||
|
case 'ä': s[i] = 'a'; break;
|
||||||
|
case 'ö': s[i] = 'o'; break;
|
||||||
|
case 'ü': s[i] = 'u'; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 6 && s[len-3] == 'n' && s[len-2] == 'e' && s[len-1] == 'n')
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (len > 5)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'n': if (s[len-2] == 'e') return len - 2; else break;
|
||||||
|
case 'e': if (s[len-2] == 's') return len - 2; else break;
|
||||||
|
case 's': if (s[len-2] == 'e') return len - 2; else break;
|
||||||
|
case 'r': if (s[len-2] == 'e') return len - 2; else break;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'n':
|
||||||
|
case 'e':
|
||||||
|
case 's':
|
||||||
|
case 'r': return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.en;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link EnglishMinimalStemmer} to stem
|
||||||
|
* English words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class EnglishMinimalStemFilter extends TokenFilter {
|
||||||
|
private final EnglishMinimalStemmer stemmer = new EnglishMinimalStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public EnglishMinimalStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,45 @@
|
||||||
|
package org.apache.lucene.analysis.en;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimal plural stemmer for English.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the "S-Stemmer" from
|
||||||
|
* <i>How Effective Is Suffixing?</i>
|
||||||
|
* Donna Harman.
|
||||||
|
*/
|
||||||
|
public class EnglishMinimalStemmer {
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len < 3 || s[len-1] != 's')
|
||||||
|
return len;
|
||||||
|
|
||||||
|
switch(s[len-2]) {
|
||||||
|
case 'u':
|
||||||
|
case 's': return len;
|
||||||
|
case 'e':
|
||||||
|
if (len > 3 && s[len-3] == 'i' && s[len-4] != 'a' && s[len-4] != 'e') {
|
||||||
|
s[len - 3] = 'y';
|
||||||
|
return len - 2;
|
||||||
|
}
|
||||||
|
if (s[len-3] == 'i' || s[len-3] == 'a' || s[len-3] == 'o' || s[len-3] == 'e')
|
||||||
|
return len;
|
||||||
|
default: return len - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link SpanishLightStemmer} to stem Spanish
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class SpanishLightStemFilter extends TokenFilter {
|
||||||
|
private final SpanishLightStemmer stemmer = new SpanishLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public SpanishLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,109 @@
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Light Stemmer for Spanish
 * <p>
 * Implements the algorithm described in
 * <i>Report on CLEF-2001 Experiments</i> by Jacques Savoy.
 */
public class SpanishLightStemmer {

  /**
   * Stems the word stored in the first {@code len} characters of {@code s}.
   * Words shorter than five characters are returned untouched. For longer
   * words the accented vowels handled below are replaced by their plain
   * form in place before a final vowel or plural ending is removed.
   *
   * @param s   buffer holding the word to stem; modified in place
   * @param len number of valid characters in {@code s}
   * @return the length of the stemmed word
   */
  public int stem(char[] s, int len) {
    if (len < 5) {
      return len;
    }

    for (int pos = 0; pos < len; pos++) {
      s[pos] = foldAccent(s[pos]);
    }

    final char last = s[len - 1];

    // A single trailing vowel (o, a, e) is dropped.
    if (last == 'o' || last == 'a' || last == 'e') {
      return len - 1;
    }

    if (last == 's') {
      final char beforeS = s[len - 2];

      // "-eses" -> "-es"
      if (beforeS == 'e' && s[len - 3] == 's' && s[len - 4] == 'e') {
        return len - 2;
      }

      // "-ces" -> "-z"
      if (beforeS == 'e' && s[len - 3] == 'c') {
        s[len - 3] = 'z';
        return len - 2;
      }

      // "-os", "-as", "-es" lose both characters.
      if (beforeS == 'o' || beforeS == 'a' || beforeS == 'e') {
        return len - 2;
      }
    }

    return len;
  }

  /** Maps the accented vowels handled by this stemmer to their plain form; other characters are returned as-is. */
  private static char foldAccent(char c) {
    switch (c) {
      case 'à':
      case 'á':
      case 'â':
      case 'ä':
        return 'a';
      case 'ò':
      case 'ó':
      case 'ô':
      case 'ö':
        return 'o';
      case 'è':
      case 'é':
      case 'ê':
      case 'ë':
        return 'e';
      case 'ù':
      case 'ú':
      case 'û':
      case 'ü':
        return 'u';
      case 'ì':
      case 'í':
      case 'î':
      case 'ï':
        return 'i';
      default:
        return c;
    }
  }
}
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.fa;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalizer for Persian.
|
* Normalizer for Persian.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -82,20 +84,4 @@ public class PersianNormalizer {
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Delete a character in-place
|
|
||||||
*
|
|
||||||
* @param s Input Buffer
|
|
||||||
* @param pos Position of character to delete
|
|
||||||
* @param len length of input buffer
|
|
||||||
* @return length of input buffer after deletion
|
|
||||||
*/
|
|
||||||
protected int delete(char s[], int pos, int len) {
|
|
||||||
if (pos < len)
|
|
||||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
|
||||||
|
|
||||||
return len - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.fi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link FinnishLightStemmer} to stem Finnish
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class FinnishLightStemFilter extends TokenFilter {
|
||||||
|
private final FinnishLightStemmer stemmer = new FinnishLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public FinnishLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,259 @@
|
||||||
|
package org.apache.lucene.analysis.fi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for Finnish.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the algorithm described in:
|
||||||
|
* <i>Report on CLEF-2003 Monolingual Tracks</i>
|
||||||
|
* Jacques Savoy
|
||||||
|
*/
|
||||||
|
public class FinnishLightStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len < 4)
|
||||||
|
return len;
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
switch(s[i]) {
|
||||||
|
case 'ä':
|
||||||
|
case 'å': s[i] = 'a'; break;
|
||||||
|
case 'ö': s[i] = 'o'; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
len = step1(s, len);
|
||||||
|
len = step2(s, len);
|
||||||
|
len = step3(s, len);
|
||||||
|
len = norm1(s, len);
|
||||||
|
len = norm2(s, len);
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int step1(char s[], int len) {
|
||||||
|
if (len > 8) {
|
||||||
|
if (endsWith(s, len, "kin"))
|
||||||
|
return step1(s, len-3);
|
||||||
|
if (endsWith(s, len, "ko"))
|
||||||
|
return step1(s, len-2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 11) {
|
||||||
|
if (endsWith(s, len, "dellinen"))
|
||||||
|
return len-8;
|
||||||
|
if (endsWith(s, len, "dellisuus"))
|
||||||
|
return len-9;
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int step2(char s[], int len) {
|
||||||
|
if (len > 5) {
|
||||||
|
if (endsWith(s, len, "lla")
|
||||||
|
|| endsWith(s, len, "tse")
|
||||||
|
|| endsWith(s, len, "sti"))
|
||||||
|
return len-3;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "ni"))
|
||||||
|
return len-2;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "aa"))
|
||||||
|
return len-1; // aa -> a
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int step3(char s[], int len) {
|
||||||
|
if (len > 8) {
|
||||||
|
if (endsWith(s, len, "nnen")) {
|
||||||
|
s[len-4] = 's';
|
||||||
|
return len-3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endsWith(s, len, "ntena")) {
|
||||||
|
s[len-5] = 's';
|
||||||
|
return len-4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endsWith(s, len, "tten"))
|
||||||
|
return len-4;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "eiden"))
|
||||||
|
return len-5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 6) {
|
||||||
|
if (endsWith(s, len, "neen")
|
||||||
|
|| endsWith(s, len, "niin")
|
||||||
|
|| endsWith(s, len, "seen")
|
||||||
|
|| endsWith(s, len, "teen")
|
||||||
|
|| endsWith(s, len, "inen"))
|
||||||
|
return len-4;
|
||||||
|
|
||||||
|
if (s[len-3] == 'h' && isVowel(s[len-2]) && s[len-1] == 'n')
|
||||||
|
return len-3;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "den")) {
|
||||||
|
s[len-3] = 's';
|
||||||
|
return len-2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endsWith(s, len, "ksen")) {
|
||||||
|
s[len-4] = 's';
|
||||||
|
return len-3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endsWith(s, len, "ssa")
|
||||||
|
|| endsWith(s, len, "sta")
|
||||||
|
|| endsWith(s, len, "lla")
|
||||||
|
|| endsWith(s, len, "lta")
|
||||||
|
|| endsWith(s, len, "tta")
|
||||||
|
|| endsWith(s, len, "ksi")
|
||||||
|
|| endsWith(s, len, "lle"))
|
||||||
|
return len-3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 5) {
|
||||||
|
if (endsWith(s, len, "na")
|
||||||
|
|| endsWith(s, len, "ne"))
|
||||||
|
return len-2;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "nei"))
|
||||||
|
return len-3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4) {
|
||||||
|
if (endsWith(s, len, "ja")
|
||||||
|
|| endsWith(s, len, "ta"))
|
||||||
|
return len-2;
|
||||||
|
|
||||||
|
if (s[len-1] == 'a')
|
||||||
|
return len-1;
|
||||||
|
|
||||||
|
if (s[len-1] == 'n' && isVowel(s[len-2]))
|
||||||
|
return len-2;
|
||||||
|
|
||||||
|
if (s[len-1] == 'n')
|
||||||
|
return len-1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int norm1(char s[], int len) {
|
||||||
|
if (len > 5 && endsWith(s, len, "hde")) {
|
||||||
|
s[len-3] = 'k';
|
||||||
|
s[len-2] = 's';
|
||||||
|
s[len-1] = 'i';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4) {
|
||||||
|
if (endsWith(s, len, "ei") || endsWith(s, len, "at"))
|
||||||
|
return len-2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 3)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 't':
|
||||||
|
case 's':
|
||||||
|
case 'j':
|
||||||
|
case 'e':
|
||||||
|
case 'a':
|
||||||
|
case 'i': return len-1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int norm2(char s[], int len) {
|
||||||
|
if (len > 8) {
|
||||||
|
if (s[len-1] == 'e'
|
||||||
|
|| s[len-1] == 'o'
|
||||||
|
|| s[len-1] == 'u')
|
||||||
|
len--;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4) {
|
||||||
|
if (s[len-1] == 'i')
|
||||||
|
len--;
|
||||||
|
|
||||||
|
if (len > 4) {
|
||||||
|
char ch = s[0];
|
||||||
|
for (int i = 1; i < len; i++) {
|
||||||
|
if (s[i] == ch &&
|
||||||
|
(ch == 'k' || ch == 'p' || ch == 't'))
|
||||||
|
len = delete(s, i--, len);
|
||||||
|
else
|
||||||
|
ch = s[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isVowel(char ch) {
|
||||||
|
switch(ch) {
|
||||||
|
case 'a':
|
||||||
|
case 'e':
|
||||||
|
case 'i':
|
||||||
|
case 'o':
|
||||||
|
case 'u':
|
||||||
|
case 'y': return true;
|
||||||
|
default: return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.fr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link FrenchLightStemmer} to stem French
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class FrenchLightStemFilter extends TokenFilter {
|
||||||
|
private final FrenchLightStemmer stemmer = new FrenchLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public FrenchLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,267 @@
|
||||||
|
package org.apache.lucene.analysis.fr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for French.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the "UniNE" algorithm in:
|
||||||
|
* <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
|
||||||
|
* Jacques Savoy
|
||||||
|
*/
|
||||||
|
public class FrenchLightStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len > 5 && s[len-1] == 'x') {
|
||||||
|
if (s[len-3] == 'a' && s[len-2] == 'u' && s[len-4] != 'e')
|
||||||
|
s[len-2] = 'l';
|
||||||
|
len--;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 3 && s[len-1] == 'x')
|
||||||
|
len--;
|
||||||
|
|
||||||
|
if (len > 3 && s[len-1] == 's')
|
||||||
|
len--;
|
||||||
|
|
||||||
|
if (len > 9 && endsWith(s, len, "issement")) {
|
||||||
|
len -= 6;
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "issant")) {
|
||||||
|
len -= 4;
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 6 && endsWith(s, len, "ement")) {
|
||||||
|
len -= 4;
|
||||||
|
if (len > 3 && endsWith(s, len, "ive")) {
|
||||||
|
len--;
|
||||||
|
s[len-1] = 'f';
|
||||||
|
}
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 11 && endsWith(s, len, "ficatrice")) {
|
||||||
|
len -= 5;
|
||||||
|
s[len-2] = 'e';
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 10 && endsWith(s, len, "ficateur")) {
|
||||||
|
len -= 4;
|
||||||
|
s[len-2] = 'e';
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 9 && endsWith(s, len, "catrice")) {
|
||||||
|
len -= 3;
|
||||||
|
s[len-4] = 'q';
|
||||||
|
s[len-3] = 'u';
|
||||||
|
s[len-2] = 'e';
|
||||||
|
//s[len-1] = 'r' <-- unnecessary, already 'r'.
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "cateur")) {
|
||||||
|
len -= 2;
|
||||||
|
s[len-4] = 'q';
|
||||||
|
s[len-3] = 'u';
|
||||||
|
s[len-2] = 'e';
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "atrice")) {
|
||||||
|
len -= 4;
|
||||||
|
s[len-2] = 'e';
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 7 && endsWith(s, len, "ateur")) {
|
||||||
|
len -= 3;
|
||||||
|
s[len-2] = 'e';
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 6 && endsWith(s, len, "trice")) {
|
||||||
|
len--;
|
||||||
|
s[len-3] = 'e';
|
||||||
|
s[len-2] = 'u';
|
||||||
|
s[len-1] = 'r';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 5 && endsWith(s, len, "ième"))
|
||||||
|
return norm(s, len-4);
|
||||||
|
|
||||||
|
if (len > 7 && endsWith(s, len, "teuse")) {
|
||||||
|
len -= 2;
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 6 && endsWith(s, len, "teur")) {
|
||||||
|
len--;
|
||||||
|
s[len-1] = 'r';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 5 && endsWith(s, len, "euse"))
|
||||||
|
return norm(s, len-2);
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "ère")) {
|
||||||
|
len--;
|
||||||
|
s[len-2] = 'e';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 7 && endsWith(s, len, "ive")) {
|
||||||
|
len--;
|
||||||
|
s[len-1] = 'f';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4 &&
|
||||||
|
(endsWith(s, len, "folle") ||
|
||||||
|
endsWith(s, len, "molle"))) {
|
||||||
|
len -= 2;
|
||||||
|
s[len-1] = 'u';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 9 && endsWith(s, len, "nnelle"))
|
||||||
|
return norm(s, len-5);
|
||||||
|
|
||||||
|
if (len > 9 && endsWith(s, len, "nnel"))
|
||||||
|
return norm(s, len-3);
|
||||||
|
|
||||||
|
if (len > 4 && endsWith(s, len, "ète")) {
|
||||||
|
len--;
|
||||||
|
s[len-2] = 'e';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "ique"))
|
||||||
|
len -= 4;
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "esse"))
|
||||||
|
return norm(s, len-3);
|
||||||
|
|
||||||
|
if (len > 7 && endsWith(s, len, "inage"))
|
||||||
|
return norm(s, len-3);
|
||||||
|
|
||||||
|
if (len > 9 && endsWith(s, len, "isation")) {
|
||||||
|
len -= 7;
|
||||||
|
if (len > 5 && endsWith(s, len, "ual"))
|
||||||
|
s[len-2] = 'e';
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 9 && endsWith(s, len, "isateur"))
|
||||||
|
return norm(s, len-7);
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "ation"))
|
||||||
|
return norm(s, len-5);
|
||||||
|
|
||||||
|
if (len > 8 && endsWith(s, len, "ition"))
|
||||||
|
return norm(s, len-5);
|
||||||
|
|
||||||
|
return norm(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int norm(char s[], int len) {
|
||||||
|
if (len > 4) {
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
switch(s[i]) {
|
||||||
|
case 'à':
|
||||||
|
case 'á':
|
||||||
|
case 'â': s[i] = 'a'; break;
|
||||||
|
case 'ô': s[i] = 'o'; break;
|
||||||
|
case 'è':
|
||||||
|
case 'é':
|
||||||
|
case 'ê': s[i] = 'e'; break;
|
||||||
|
case 'ù':
|
||||||
|
case 'û': s[i] = 'u'; break;
|
||||||
|
case 'î': s[i] = 'i'; break;
|
||||||
|
case 'ç': s[i] = 'c'; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
char ch = s[0];
|
||||||
|
for (int i = 1; i < len; i++) {
|
||||||
|
if (s[i] == ch)
|
||||||
|
len = delete(s, i--, len);
|
||||||
|
else
|
||||||
|
ch = s[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4 && endsWith(s, len, "ie"))
|
||||||
|
len -= 2;
|
||||||
|
|
||||||
|
if (len > 4) {
|
||||||
|
if (s[len-1] == 'r') len--;
|
||||||
|
if (s[len-1] == 'e') len--;
|
||||||
|
if (s[len-1] == 'e') len--;
|
||||||
|
if (s[len-1] == s[len-2]) len--;
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.fr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link FrenchMinimalStemmer} to stem French
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class FrenchMinimalStemFilter extends TokenFilter {
|
||||||
|
private final FrenchMinimalStemmer stemmer = new FrenchMinimalStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public FrenchMinimalStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,80 @@
|
||||||
|
package org.apache.lucene.analysis.fr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Minimal Stemmer for French.
 * <p>
 * This stemmer implements the following algorithm:
 * <i>A Stemming procedure and stopword list for general French corpora.</i>
 * Jacques Savoy.
 */
public class FrenchMinimalStemmer {

  /**
   * Stems the word held in {@code s[0..len)} in place.
   * <p>
   * Words shorter than six characters are returned untouched. A final 'x' is
   * dropped (turning a preceding "au" into "al"); otherwise a trailing 's',
   * 'r', 'e' and e-acute are dropped one after another, followed by one
   * character of a doubled final letter. The doubled-character rule only
   * applies to letters, so numeric tokens such as "1234555" keep all digits.
   *
   * @param s buffer holding the word; may be modified in place
   * @param len number of valid characters in {@code s}
   * @return the number of characters of {@code s} that form the stem
   */
  public int stem(char s[], int len) {
    if (len < 6) {
      return len;
    }

    if (s[len-1] == 'x') {
      // e.g. "-aux" becomes "-al"
      if (s[len-3] == 'a' && s[len-2] == 'u') {
        s[len-2] = 'l';
      }
      return len - 1;
    }

    if (s[len-1] == 's') len--;
    if (s[len-1] == 'r') len--;
    if (s[len-1] == 'e') len--;
    if (s[len-1] == '\u00E9') len--; // e-acute (U+00E9)
    // drop a doubled final letter, but never a doubled digit/symbol
    if (s[len-1] == s[len-2] && Character.isLetter(s[len-1])) len--;
    return len;
  }
}
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hi;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalizer for Hindi.
|
* Normalizer for Hindi.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -176,19 +178,4 @@ public class HindiNormalizer {
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Delete a character in-place
|
|
||||||
*
|
|
||||||
* @param s Input Buffer
|
|
||||||
* @param pos Position of character to delete
|
|
||||||
* @param len length of input buffer
|
|
||||||
* @return length of input buffer after deletion
|
|
||||||
*/
|
|
||||||
protected int delete(char s[], int pos, int len) {
|
|
||||||
if (pos < len)
|
|
||||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
|
||||||
|
|
||||||
return len - 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hi;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Light Stemmer for Hindi.
|
* Light Stemmer for Hindi.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -116,15 +118,4 @@ public class HindiStemmer {
|
||||||
return len - 1;
|
return len - 1;
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean endsWith(final char s[], final int len, final String suffix) {
|
|
||||||
final int suffixLen = suffix.length();
|
|
||||||
if (suffixLen > len)
|
|
||||||
return false;
|
|
||||||
for (int i = suffixLen - 1; i >= 0; i--)
|
|
||||||
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.hu;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link HungarianLightStemmer} to stem
|
||||||
|
* Hungarian words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class HungarianLightStemFilter extends TokenFilter {
|
||||||
|
private final HungarianLightStemmer stemmer = new HungarianLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public HungarianLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,238 @@
|
||||||
|
package org.apache.lucene.analysis.hu;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for Hungarian.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the "UniNE" algorithm in:
|
||||||
|
* <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
|
||||||
|
* Jacques Savoy
|
||||||
|
*/
|
||||||
|
public class HungarianLightStemmer {
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
switch(s[i]) {
|
||||||
|
case 'á': s[i] = 'a'; break;
|
||||||
|
case 'ë':
|
||||||
|
case 'é': s[i] = 'e'; break;
|
||||||
|
case 'í': s[i] = 'i'; break;
|
||||||
|
case 'ó':
|
||||||
|
case 'ő':
|
||||||
|
case 'õ':
|
||||||
|
case 'ö': s[i] = 'o'; break;
|
||||||
|
case 'ú':
|
||||||
|
case 'ű':
|
||||||
|
case 'ũ':
|
||||||
|
case 'û':
|
||||||
|
case 'ü': s[i] = 'u'; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
len = removeCase(s, len);
|
||||||
|
len = removePossessive(s, len);
|
||||||
|
len = removePlural(s, len);
|
||||||
|
return normalize(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int removeCase(char s[], int len) {
|
||||||
|
if (len > 6 && endsWith(s, len, "kent"))
|
||||||
|
return len - 4;
|
||||||
|
|
||||||
|
if (len > 5) {
|
||||||
|
if (endsWith(s, len, "nak") ||
|
||||||
|
endsWith(s, len, "nek") ||
|
||||||
|
endsWith(s, len, "val") ||
|
||||||
|
endsWith(s, len, "vel") ||
|
||||||
|
endsWith(s, len, "ert") ||
|
||||||
|
endsWith(s, len, "rol") ||
|
||||||
|
endsWith(s, len, "ban") ||
|
||||||
|
endsWith(s, len, "ben") ||
|
||||||
|
endsWith(s, len, "bol") ||
|
||||||
|
endsWith(s, len, "nal") ||
|
||||||
|
endsWith(s, len, "nel") ||
|
||||||
|
endsWith(s, len, "hoz") ||
|
||||||
|
endsWith(s, len, "hez") ||
|
||||||
|
endsWith(s, len, "tol"))
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "al") || endsWith(s, len, "el")) {
|
||||||
|
if (!isVowel(s[len-3]) && s[len-3] == s[len-4])
|
||||||
|
return len - 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4) {
|
||||||
|
if (endsWith(s, len, "at") ||
|
||||||
|
endsWith(s, len, "et") ||
|
||||||
|
endsWith(s, len, "ot") ||
|
||||||
|
endsWith(s, len, "va") ||
|
||||||
|
endsWith(s, len, "ve") ||
|
||||||
|
endsWith(s, len, "ra") ||
|
||||||
|
endsWith(s, len, "re") ||
|
||||||
|
endsWith(s, len, "ba") ||
|
||||||
|
endsWith(s, len, "be") ||
|
||||||
|
endsWith(s, len, "ul") ||
|
||||||
|
endsWith(s, len, "ig"))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if ((endsWith(s, len, "on") || endsWith(s, len, "en")) && !isVowel(s[len-3]))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 't':
|
||||||
|
case 'n': return len - 1;
|
||||||
|
case 'a':
|
||||||
|
case 'e': if (s[len-2] == s[len-3] && !isVowel(s[len-2])) return len - 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int removePossessive(char s[], int len) {
|
||||||
|
if (len > 6) {
|
||||||
|
if (!isVowel(s[len-5]) &&
|
||||||
|
(endsWith(s, len, "atok") ||
|
||||||
|
endsWith(s, len, "otok") ||
|
||||||
|
endsWith(s, len, "etek")))
|
||||||
|
return len - 4;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "itek") || endsWith(s, len, "itok"))
|
||||||
|
return len - 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 5) {
|
||||||
|
if (!isVowel(s[len-4]) &&
|
||||||
|
(endsWith(s, len, "unk") ||
|
||||||
|
endsWith(s, len, "tok") ||
|
||||||
|
endsWith(s, len, "tek")))
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (isVowel(s[len-4]) && endsWith(s, len, "juk"))
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "ink"))
|
||||||
|
return len - 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4) {
|
||||||
|
if (!isVowel(s[len-3]) &&
|
||||||
|
(endsWith(s, len, "am") ||
|
||||||
|
endsWith(s, len, "em") ||
|
||||||
|
endsWith(s, len, "om") ||
|
||||||
|
endsWith(s, len, "ad") ||
|
||||||
|
endsWith(s, len, "ed") ||
|
||||||
|
endsWith(s, len, "od") ||
|
||||||
|
endsWith(s, len, "uk")))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if (isVowel(s[len-3]) &&
|
||||||
|
(endsWith(s, len, "nk") ||
|
||||||
|
endsWith(s, len, "ja") ||
|
||||||
|
endsWith(s, len, "je")))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "im") ||
|
||||||
|
endsWith(s, len, "id") ||
|
||||||
|
endsWith(s, len, "ik"))
|
||||||
|
return len - 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 3)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'a':
|
||||||
|
case 'e': if (!isVowel(s[len-2])) return len - 1; break;
|
||||||
|
case 'm':
|
||||||
|
case 'd': if (isVowel(s[len-2])) return len - 1; break;
|
||||||
|
case 'i': return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int removePlural(char s[], int len) {
|
||||||
|
if (len > 3 && s[len-1] == 'k')
|
||||||
|
switch(s[len-2]) {
|
||||||
|
case 'a':
|
||||||
|
case 'o':
|
||||||
|
case 'e': if (len > 4) return len - 2; /* intentional fallthru */
|
||||||
|
default: return len - 1;
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int normalize(char s[], int len) {
|
||||||
|
if (len > 3)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'a':
|
||||||
|
case 'e':
|
||||||
|
case 'i':
|
||||||
|
case 'o': return len - 1;
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isVowel(char ch) {
|
||||||
|
switch(ch) {
|
||||||
|
case 'a':
|
||||||
|
case 'e':
|
||||||
|
case 'i':
|
||||||
|
case 'o':
|
||||||
|
case 'u':
|
||||||
|
case 'y': return true;
|
||||||
|
default: return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.id;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stemmer for Indonesian.
|
* Stemmer for Indonesian.
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -266,39 +268,5 @@ public class IndonesianStemmer {
|
||||||
return length - 1;
|
return length - 1;
|
||||||
}
|
}
|
||||||
return length;
|
return length;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean startsWith(char s[], int len, String prefix) {
|
|
||||||
final int prefixLen = prefix.length();
|
|
||||||
if (prefixLen > len)
|
|
||||||
return false;
|
|
||||||
for (int i = 0; i < prefixLen; i++)
|
|
||||||
if (s[i] != prefix.charAt(i))
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean endsWith(char s[], int len, String suffix) {
|
|
||||||
final int suffixLen = suffix.length();
|
|
||||||
if (suffixLen > len)
|
|
||||||
return false;
|
|
||||||
for (int i = suffixLen - 1; i >= 0; i--)
|
|
||||||
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int deleteN(char s[], int pos, int len, int nChars) {
|
|
||||||
for (int i = 0; i < nChars; i++)
|
|
||||||
len = delete(s, pos, len);
|
|
||||||
return len;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int delete(char s[], int pos, int len) {
|
|
||||||
if (pos < len)
|
|
||||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
|
||||||
|
|
||||||
return len - 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.in;
|
||||||
import java.util.BitSet;
|
import java.util.BitSet;
|
||||||
import java.util.IdentityHashMap;
|
import java.util.IdentityHashMap;
|
||||||
import static java.lang.Character.UnicodeBlock.*;
|
import static java.lang.Character.UnicodeBlock.*;
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalizes the Unicode representation of text in Indian languages.
|
* Normalizes the Unicode representation of text in Indian languages.
|
||||||
|
@ -290,14 +291,4 @@ public class IndicNormalizer {
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Delete a character in-place
|
|
||||||
*/
|
|
||||||
private int delete(char s[], int pos, int len) {
|
|
||||||
if (pos < len)
|
|
||||||
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
|
||||||
|
|
||||||
return len - 1;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.it;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link ItalianLightStemmer} to stem Italian
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class ItalianLightStemFilter extends TokenFilter {
|
||||||
|
private final ItalianLightStemmer stemmer = new ItalianLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public ItalianLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,117 @@
|
||||||
|
package org.apache.lucene.analysis.it;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Light Stemmer for Italian.
 * <p>
 * This stemmer implements the algorithm described in:
 * <i>Report on CLEF-2001 Experiments</i>
 * Jacques Savoy
 */
public class ItalianLightStemmer {

  /**
   * Stems the word held in {@code s[0..len)} in place.
   * <p>
   * Words shorter than six characters are returned untouched (not even their
   * accents are folded). Otherwise accented vowels are replaced by their base
   * letter and a final 'e', 'i', 'a' or 'o' is removed, together with the
   * character before it when that is 'i' (or 'h' for a final 'e'/'i').
   *
   * @param s buffer holding the word; modified in place
   * @param len number of valid characters in {@code s}
   * @return the number of characters of {@code s} that form the stem
   */
  public int stem(char s[], int len) {
    if (len < 6) {
      return len;
    }

    for (int pos = 0; pos < len; pos++) {
      final char folded = foldAccent(s[pos]);
      if (folded != s[pos]) {
        s[pos] = folded;
      }
    }

    final char last = s[len-1];
    final char beforeLast = s[len-2];

    if (last == 'e' || last == 'i') {
      final boolean dropTwo = beforeLast == 'i' || beforeLast == 'h';
      return dropTwo ? len - 2 : len - 1;
    }

    if (last == 'a' || last == 'o') {
      return beforeLast == 'i' ? len - 2 : len - 1;
    }

    return len;
  }

  /** Maps the accented vowels handled by this stemmer to their base letter. */
  private static char foldAccent(char ch) {
    switch (ch) {
      case 'à':
      case 'á':
      case 'â':
      case 'ä':
        return 'a';
      case 'ò':
      case 'ó':
      case 'ô':
      case 'ö':
        return 'o';
      case 'è':
      case 'é':
      case 'ê':
      case 'ë':
        return 'e';
      case 'ù':
      case 'ú':
      case 'û':
      case 'ü':
        return 'u';
      case 'ì':
      case 'í':
      case 'î':
      case 'ï':
        return 'i';
      default:
        return ch;
    }
  }
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link PortugueseLightStemmer} to stem
|
||||||
|
* Portuguese words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class PortugueseLightStemFilter extends TokenFilter {
|
||||||
|
private final PortugueseLightStemmer stemmer = new PortugueseLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public PortugueseLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,202 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for Portuguese
|
||||||
|
*/
|
||||||
|
public class PortugueseLightStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len < 4)
|
||||||
|
return len;
|
||||||
|
|
||||||
|
len = removeSuffix(s, len);
|
||||||
|
|
||||||
|
if (len > 3 && s[len-1] == 'a')
|
||||||
|
len = normFeminine(s, len);
|
||||||
|
|
||||||
|
if (len > 4)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'e':
|
||||||
|
case 'a':
|
||||||
|
case 'o': len--; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < len; i++)
|
||||||
|
switch(s[i]) {
|
||||||
|
case 'à':
|
||||||
|
case 'á':
|
||||||
|
case 'â':
|
||||||
|
case 'ä':
|
||||||
|
case 'ã': s[i] = 'a'; break;
|
||||||
|
case 'ò':
|
||||||
|
case 'ó':
|
||||||
|
case 'ô':
|
||||||
|
case 'ö':
|
||||||
|
case 'õ': s[i] = 'o'; break;
|
||||||
|
case 'è':
|
||||||
|
case 'é':
|
||||||
|
case 'ê':
|
||||||
|
case 'ë': s[i] = 'e'; break;
|
||||||
|
case 'ù':
|
||||||
|
case 'ú':
|
||||||
|
case 'û':
|
||||||
|
case 'ü': s[i] = 'u'; break;
|
||||||
|
case 'ì':
|
||||||
|
case 'í':
|
||||||
|
case 'î':
|
||||||
|
case 'ï': s[i] = 'i'; break;
|
||||||
|
case 'ç': s[i] = 'c'; break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int removeSuffix(char s[], int len) {
|
||||||
|
if (len > 4 && endsWith(s, len, "es"))
|
||||||
|
switch(s[len-3]) {
|
||||||
|
case 'r':
|
||||||
|
case 's':
|
||||||
|
case 'l':
|
||||||
|
case 'z': return len - 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 3 && endsWith(s, len, "ns")) {
|
||||||
|
s[len - 2] = 'm';
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4 && (endsWith(s, len, "eis") || endsWith(s, len, "éis"))) {
|
||||||
|
s[len - 3] = 'e';
|
||||||
|
s[len - 2] = 'l';
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4 && endsWith(s, len, "ais")) {
|
||||||
|
s[len - 2] = 'l';
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4 && endsWith(s, len, "óis")) {
|
||||||
|
s[len - 3] = 'o';
|
||||||
|
s[len - 2] = 'l';
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 4 && endsWith(s, len, "is")) {
|
||||||
|
s[len - 1] = 'l';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 3 &&
|
||||||
|
(endsWith(s, len, "ões") ||
|
||||||
|
endsWith(s, len, "ães"))) {
|
||||||
|
len--;
|
||||||
|
s[len - 2] = 'ã';
|
||||||
|
s[len - 1] = 'o';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 6 && endsWith(s, len, "mente"))
|
||||||
|
return len - 5;
|
||||||
|
|
||||||
|
if (len > 3 && s[len-1] == 's')
|
||||||
|
return len - 1;
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int normFeminine(char s[], int len) {
|
||||||
|
if (len > 7 &&
|
||||||
|
(endsWith(s, len, "inha") ||
|
||||||
|
endsWith(s, len, "iaca") ||
|
||||||
|
endsWith(s, len, "eira"))) {
|
||||||
|
s[len - 1] = 'o';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 6) {
|
||||||
|
if (endsWith(s, len, "osa") ||
|
||||||
|
endsWith(s, len, "ica") ||
|
||||||
|
endsWith(s, len, "ida") ||
|
||||||
|
endsWith(s, len, "ada") ||
|
||||||
|
endsWith(s, len, "iva") ||
|
||||||
|
endsWith(s, len, "ama")) {
|
||||||
|
s[len - 1] = 'o';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endsWith(s, len, "ona")) {
|
||||||
|
s[len - 3] = 'ã';
|
||||||
|
s[len - 2] = 'o';
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endsWith(s, len, "ora"))
|
||||||
|
return len - 1;
|
||||||
|
|
||||||
|
if (endsWith(s, len, "esa")) {
|
||||||
|
s[len - 3] = 'ê';
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (endsWith(s, len, "na")) {
|
||||||
|
s[len - 1] = 'o';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link PortugueseMinimalStemmer} to stem
|
||||||
|
* Portuguese words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class PortugueseMinimalStemFilter extends TokenFilter {
|
||||||
|
private final PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public PortugueseMinimalStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,119 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Minimal Stemmer for Portuguese
|
||||||
|
* <p>
|
||||||
|
* This follows the "RSLP-S" algorithm presented in:
|
||||||
|
* <i>A study on the Use of Stemming for Monolingual Ad-Hoc Portuguese
|
||||||
|
* Information Retrieval</i> (Orengo, et al)
|
||||||
|
* which is just the plural reduction step of the RSLP
|
||||||
|
* algorithm from <i>A Stemming Algorithmm for the Portuguese Language</i>,
|
||||||
|
* Orengo et al.
|
||||||
|
*/
|
||||||
|
public class PortugueseMinimalStemmer {
|
||||||
|
|
||||||
|
private static final CharArraySet excIS = new CharArraySet(Version.LUCENE_31,
|
||||||
|
Arrays.asList("lápis", "cais", "mais", "crúcis", "biquínis", "pois",
|
||||||
|
"depois","dois","leis"),
|
||||||
|
false);
|
||||||
|
|
||||||
|
private static final CharArraySet excS = new CharArraySet(Version.LUCENE_31,
|
||||||
|
Arrays.asList("aliás", "pires", "lápis", "cais", "mais", "mas", "menos",
|
||||||
|
"férias", "fezes", "pêsames", "crúcis", "gás", "atrás", "moisés",
|
||||||
|
"através", "convés", "ês", "país", "após", "ambas", "ambos",
|
||||||
|
"messias", "depois"),
|
||||||
|
false);
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len < 3 || s[len-1] != 's')
|
||||||
|
return len;
|
||||||
|
|
||||||
|
if (s[len-2] == 'n') {
|
||||||
|
len--;
|
||||||
|
s[len-1] = 'm';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 6 && s[len-3] == 'õ' && s[len-2] == 'e') {
|
||||||
|
len--;
|
||||||
|
s[len-2] = 'ã';
|
||||||
|
s[len-1] = 'o';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 4 && s[len-3] == 'ã' && s[len-2] == 'e')
|
||||||
|
if (!(len == 4 && s[0] == 'm')) {
|
||||||
|
len--;
|
||||||
|
s[len-1] = 'o';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 4 && s[len-2] == 'i') {
|
||||||
|
if (s[len-3] == 'a')
|
||||||
|
if (!(len == 4 && (s[0] == 'c' || s[0] == 'm'))) {
|
||||||
|
len--;
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 5 && s[len-3] == 'é') {
|
||||||
|
len--;
|
||||||
|
s[len-2] = 'e';
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 5 && s[len-3] == 'e') {
|
||||||
|
len--;
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 5 && s[len-3] == 'ó') {
|
||||||
|
len--;
|
||||||
|
s[len-2] = 'o';
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!excIS.contains(s, 0, len)) {
|
||||||
|
s[len-1] = 'l';
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len >= 6 && s[len-3] == 'l' && s[len-2] == 'e')
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if (len >= 6 && s[len-3] == 'r' && s[len-2] == 'e')
|
||||||
|
if (!(len == 7 && s[0] == 'á' && s[1] == 'r' && s[2] == 'v' && s[3] == 'o'))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if (excS.contains(s, 0, len))
|
||||||
|
return len;
|
||||||
|
else
|
||||||
|
return len-1;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link RussianLightStemmer} to stem Russian
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class RussianLightStemFilter extends TokenFilter {
|
||||||
|
private final RussianLightStemmer stemmer = new RussianLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public RussianLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,153 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for Russian.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the following algorithm:
|
||||||
|
* <i>Indexing and Searching Strategies for the Russian Language.</i>
|
||||||
|
* Ljiljana Dolamic and Jacques Savoy.
|
||||||
|
*/
|
||||||
|
public class RussianLightStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
len = removeCase(s, len);
|
||||||
|
return normalize(s, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int normalize(char s[], int len) {
|
||||||
|
if (len > 3)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'ь':
|
||||||
|
case 'и': return len - 1;
|
||||||
|
case 'н': if (s[len-2] == 'н') return len - 1;
|
||||||
|
}
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int removeCase(char s[], int len) {
|
||||||
|
if (len > 6 &&
|
||||||
|
(endsWith(s, len, "иями") ||
|
||||||
|
endsWith(s, len, "оями")))
|
||||||
|
return len - 4;
|
||||||
|
|
||||||
|
if (len > 5 &&
|
||||||
|
(endsWith(s, len, "иям") ||
|
||||||
|
endsWith(s, len, "иях") ||
|
||||||
|
endsWith(s, len, "оях") ||
|
||||||
|
endsWith(s, len, "ями") ||
|
||||||
|
endsWith(s, len, "оям") ||
|
||||||
|
endsWith(s, len, "оьв") ||
|
||||||
|
endsWith(s, len, "ами") ||
|
||||||
|
endsWith(s, len, "его") ||
|
||||||
|
endsWith(s, len, "ему") ||
|
||||||
|
endsWith(s, len, "ери") ||
|
||||||
|
endsWith(s, len, "ими") ||
|
||||||
|
endsWith(s, len, "ого") ||
|
||||||
|
endsWith(s, len, "ому") ||
|
||||||
|
endsWith(s, len, "ыми") ||
|
||||||
|
endsWith(s, len, "оев")))
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (len > 4 &&
|
||||||
|
(endsWith(s, len, "ая") ||
|
||||||
|
endsWith(s, len, "яя") ||
|
||||||
|
endsWith(s, len, "ях") ||
|
||||||
|
endsWith(s, len, "юю") ||
|
||||||
|
endsWith(s, len, "ах") ||
|
||||||
|
endsWith(s, len, "ею") ||
|
||||||
|
endsWith(s, len, "их") ||
|
||||||
|
endsWith(s, len, "ия") ||
|
||||||
|
endsWith(s, len, "ию") ||
|
||||||
|
endsWith(s, len, "ьв") ||
|
||||||
|
endsWith(s, len, "ою") ||
|
||||||
|
endsWith(s, len, "ую") ||
|
||||||
|
endsWith(s, len, "ям") ||
|
||||||
|
endsWith(s, len, "ых") ||
|
||||||
|
endsWith(s, len, "ея") ||
|
||||||
|
endsWith(s, len, "ам") ||
|
||||||
|
endsWith(s, len, "ем") ||
|
||||||
|
endsWith(s, len, "ей") ||
|
||||||
|
endsWith(s, len, "ём") ||
|
||||||
|
endsWith(s, len, "ев") ||
|
||||||
|
endsWith(s, len, "ий") ||
|
||||||
|
endsWith(s, len, "им") ||
|
||||||
|
endsWith(s, len, "ое") ||
|
||||||
|
endsWith(s, len, "ой") ||
|
||||||
|
endsWith(s, len, "ом") ||
|
||||||
|
endsWith(s, len, "ов") ||
|
||||||
|
endsWith(s, len, "ые") ||
|
||||||
|
endsWith(s, len, "ый") ||
|
||||||
|
endsWith(s, len, "ым") ||
|
||||||
|
endsWith(s, len, "ми")))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if (len > 3)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 'а':
|
||||||
|
case 'е':
|
||||||
|
case 'и':
|
||||||
|
case 'о':
|
||||||
|
case 'у':
|
||||||
|
case 'й':
|
||||||
|
case 'ы':
|
||||||
|
case 'я':
|
||||||
|
case 'ь': return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,58 @@
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A {@link TokenFilter} that applies {@link SwedishLightStemmer} to stem Swedish
|
||||||
|
* words.
|
||||||
|
* <p>
|
||||||
|
* To prevent terms from being stemmed use an instance of
|
||||||
|
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||||
|
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public final class SwedishLightStemFilter extends TokenFilter {
|
||||||
|
private final SwedishLightStemmer stemmer = new SwedishLightStemmer();
|
||||||
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
|
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||||
|
|
||||||
|
public SwedishLightStemFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (input.incrementToken()) {
|
||||||
|
if (!keywordAttr.isKeyword()) {
|
||||||
|
final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
|
||||||
|
termAtt.setLength(newlen);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,111 @@
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This algorithm is updated based on code located at:
|
||||||
|
* http://members.unine.ch/jacques.savoy/clef/
|
||||||
|
*
|
||||||
|
* Full copyright for that code follows:
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005, Jacques Savoy
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* Redistributions of source code must retain the above copyright notice, this
|
||||||
|
* list of conditions and the following disclaimer. Redistributions in binary
|
||||||
|
* form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials
|
||||||
|
* provided with the distribution. Neither the name of the author nor the names
|
||||||
|
* of its contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.StemmerUtil.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Light Stemmer for Swedish.
|
||||||
|
* <p>
|
||||||
|
* This stemmer implements the algorithm described in:
|
||||||
|
* <i>Report on CLEF-2003 Monolingual Tracks</i>
|
||||||
|
* Jacques Savoy
|
||||||
|
*/
|
||||||
|
public class SwedishLightStemmer {
|
||||||
|
|
||||||
|
public int stem(char s[], int len) {
|
||||||
|
if (len > 4 && s[len-1] == 's')
|
||||||
|
len--;
|
||||||
|
|
||||||
|
if (len > 7 &&
|
||||||
|
(endsWith(s, len, "elser") ||
|
||||||
|
endsWith(s, len, "heten")))
|
||||||
|
return len - 5;
|
||||||
|
|
||||||
|
if (len > 6 &&
|
||||||
|
(endsWith(s, len, "arne") ||
|
||||||
|
endsWith(s, len, "erna") ||
|
||||||
|
endsWith(s, len, "ande") ||
|
||||||
|
endsWith(s, len, "else") ||
|
||||||
|
endsWith(s, len, "aste") ||
|
||||||
|
endsWith(s, len, "orna") ||
|
||||||
|
endsWith(s, len, "aren")))
|
||||||
|
return len - 4;
|
||||||
|
|
||||||
|
if (len > 5 &&
|
||||||
|
(endsWith(s, len, "are") ||
|
||||||
|
endsWith(s, len, "ast") ||
|
||||||
|
endsWith(s, len, "het")))
|
||||||
|
return len - 3;
|
||||||
|
|
||||||
|
if (len > 4 &&
|
||||||
|
(endsWith(s, len, "ar") ||
|
||||||
|
endsWith(s, len, "er") ||
|
||||||
|
endsWith(s, len, "or") ||
|
||||||
|
endsWith(s, len, "en") ||
|
||||||
|
endsWith(s, len, "at") ||
|
||||||
|
endsWith(s, len, "te") ||
|
||||||
|
endsWith(s, len, "et")))
|
||||||
|
return len - 2;
|
||||||
|
|
||||||
|
if (len > 3)
|
||||||
|
switch(s[len-1]) {
|
||||||
|
case 't':
|
||||||
|
case 'a':
|
||||||
|
case 'e':
|
||||||
|
case 'n': return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,89 @@
|
||||||
|
package org.apache.lucene.analysis.util;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Some commonly-used stemming functions */
|
||||||
|
public class StemmerUtil {
|
||||||
|
/**
|
||||||
|
* Returns true if the character array starts with the suffix.
|
||||||
|
*
|
||||||
|
* @param s Input Buffer
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @param suffix Suffix string to test
|
||||||
|
* @return true if <code>s</code> starts with <code>suffix</code>
|
||||||
|
*/
|
||||||
|
public static boolean startsWith(char s[], int len, String prefix) {
|
||||||
|
final int prefixLen = prefix.length();
|
||||||
|
if (prefixLen > len)
|
||||||
|
return false;
|
||||||
|
for (int i = 0; i < prefixLen; i++)
|
||||||
|
if (s[i] != prefix.charAt(i))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if the character array ends with the suffix.
|
||||||
|
*
|
||||||
|
* @param s Input Buffer
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @param suffix Suffix string to test
|
||||||
|
* @return true if <code>s</code> ends with <code>suffix</code>
|
||||||
|
*/
|
||||||
|
public static boolean endsWith(char s[], int len, String suffix) {
|
||||||
|
final int suffixLen = suffix.length();
|
||||||
|
if (suffixLen > len)
|
||||||
|
return false;
|
||||||
|
for (int i = suffixLen - 1; i >= 0; i--)
|
||||||
|
if (s[len -(suffixLen - i)] != suffix.charAt(i))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete a character in-place
|
||||||
|
*
|
||||||
|
* @param s Input Buffer
|
||||||
|
* @param pos Position of character to delete
|
||||||
|
* @param len length of input buffer
|
||||||
|
* @return length of input buffer after deletion
|
||||||
|
*/
|
||||||
|
public static int delete(char s[], int pos, int len) {
|
||||||
|
if (pos < len)
|
||||||
|
System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
|
||||||
|
|
||||||
|
return len - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete n characters in-place
|
||||||
|
*
|
||||||
|
* @param s Input Buffer
|
||||||
|
* @param pos Position of character to delete
|
||||||
|
* @param len Length of input buffer
|
||||||
|
* @param nChars number of characters to delete
|
||||||
|
* @return length of input buffer after deletion
|
||||||
|
*/
|
||||||
|
public static int deleteN(char s[], int pos, int len, int nChars) {
|
||||||
|
// TODO: speed up, this is silly
|
||||||
|
for (int i = 0; i < nChars; i++)
|
||||||
|
len = delete(s, pos, len);
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link GermanLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestGermanLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new GermanLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("delighttestdata.zip"), "delight.txt");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,60 @@
|
||||||
|
package org.apache.lucene.analysis.de;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link GermanMinimalStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestGermanMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new GermanMinimalStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test some examples from the paper */
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
checkOneTerm(analyzer, "sängerinnen", "sangerin");
|
||||||
|
checkOneTerm(analyzer, "frauen", "frau");
|
||||||
|
checkOneTerm(analyzer, "kenntnisse", "kenntnis");
|
||||||
|
checkOneTerm(analyzer, "staates", "staat");
|
||||||
|
checkOneTerm(analyzer, "bilder", "bild");
|
||||||
|
checkOneTerm(analyzer, "boote", "boot");
|
||||||
|
checkOneTerm(analyzer, "götter", "gott");
|
||||||
|
checkOneTerm(analyzer, "äpfel", "apfel");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("deminimaltestdata.zip"), "deminimal.txt");
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,17 +17,17 @@ package org.apache.lucene.analysis.de;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.InputStream;
|
||||||
import java.io.File;
|
import java.io.Reader;
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.StringReader;
|
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the German stemmer. The stemming algorithm is known to work less
|
* Test the German stemmer. The stemming algorithm is known to work less
|
||||||
|
@ -38,25 +38,18 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
|
public class TestGermanStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testStemming() throws Exception {
|
public void testStemming() throws Exception {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
|
Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer));
|
@Override
|
||||||
// read test cases from external file:
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
InputStreamReader isr = new InputStreamReader(getClass().getResourceAsStream("data.txt"), "iso-8859-1");
|
Reader reader) {
|
||||||
BufferedReader breader = new BufferedReader(isr);
|
Tokenizer t = new KeywordTokenizer(reader);
|
||||||
while(true) {
|
return new TokenStreamComponents(t,
|
||||||
String line = breader.readLine();
|
new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, t)));
|
||||||
if (line == null)
|
}
|
||||||
break;
|
};
|
||||||
line = line.trim();
|
|
||||||
if (line.startsWith("#") || line.equals(""))
|
InputStream vocOut = getClass().getResourceAsStream("data.txt");
|
||||||
continue; // ignore comments and empty lines
|
assertVocabulary(analyzer, vocOut);
|
||||||
String[] parts = line.split(";");
|
vocOut.close();
|
||||||
//System.out.println(parts[0] + " -- " + parts[1]);
|
|
||||||
tokenizer.reset(new StringReader(parts[0]));
|
|
||||||
filter.reset();
|
|
||||||
assertTokenStreamContents(filter, new String[] { parts[1] });
|
|
||||||
}
|
|
||||||
breader.close();
|
|
||||||
isr.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,48 +1,48 @@
|
||||||
# German special characters are replaced:
|
# German special characters are replaced:
|
||||||
häufig;haufig
|
häufig haufig
|
||||||
|
|
||||||
# here the stemmer works okay, it maps related words to the same stem:
|
# here the stemmer works okay, it maps related words to the same stem:
|
||||||
abschließen;abschliess
|
abschließen abschliess
|
||||||
abschließender;abschliess
|
abschließender abschliess
|
||||||
abschließendes;abschliess
|
abschließendes abschliess
|
||||||
abschließenden;abschliess
|
abschließenden abschliess
|
||||||
|
|
||||||
Tisch;tisch
|
Tisch tisch
|
||||||
Tische;tisch
|
Tische tisch
|
||||||
Tischen;tisch
|
Tischen tisch
|
||||||
|
|
||||||
Haus;hau
|
Haus hau
|
||||||
Hauses;hau
|
Hauses hau
|
||||||
Häuser;hau
|
Häuser hau
|
||||||
Häusern;hau
|
Häusern hau
|
||||||
# here's a case where overstemming occurs, i.e. a word is
|
# here's a case where overstemming occurs, i.e. a word is
|
||||||
# mapped to the same stem as unrelated words:
|
# mapped to the same stem as unrelated words:
|
||||||
hauen;hau
|
hauen hau
|
||||||
|
|
||||||
# here's a case where understemming occurs, i.e. two related words
|
# here's a case where understemming occurs, i.e. two related words
|
||||||
# are not mapped to the same stem. This is the case with basically
|
# are not mapped to the same stem. This is the case with basically
|
||||||
# all irregular forms:
|
# all irregular forms:
|
||||||
Drama;drama
|
Drama drama
|
||||||
Dramen;dram
|
Dramen dram
|
||||||
|
|
||||||
# replace "ß" with 'ss':
|
# replace "ß" with 'ss':
|
||||||
Ausmaß;ausmass
|
Ausmaß ausmass
|
||||||
|
|
||||||
# fake words to test if suffixes are cut off:
|
# fake words to test if suffixes are cut off:
|
||||||
xxxxxe;xxxxx
|
xxxxxe xxxxx
|
||||||
xxxxxs;xxxxx
|
xxxxxs xxxxx
|
||||||
xxxxxn;xxxxx
|
xxxxxn xxxxx
|
||||||
xxxxxt;xxxxx
|
xxxxxt xxxxx
|
||||||
xxxxxem;xxxxx
|
xxxxxem xxxxx
|
||||||
xxxxxer;xxxxx
|
xxxxxer xxxxx
|
||||||
xxxxxnd;xxxxx
|
xxxxxnd xxxxx
|
||||||
# the suffixes are also removed when combined:
|
# the suffixes are also removed when combined:
|
||||||
xxxxxetende;xxxxx
|
xxxxxetende xxxxx
|
||||||
|
|
||||||
# words that are shorter than four characters are not changed:
|
# words that are shorter than four characters are not changed:
|
||||||
xxe;xxe
|
xxe xxe
|
||||||
# -em and -er are not removed from words shorter than five characters:
|
# -em and -er are not removed from words shorter than five characters:
|
||||||
xxem;xxem
|
xxem xxem
|
||||||
xxer;xxer
|
xxer xxer
|
||||||
# -nd is not removed from words shorter than six characters:
|
# -nd is not removed from words shorter than six characters:
|
||||||
xxxnd;xxxnd
|
xxxnd xxxnd
|
||||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,54 @@
|
||||||
|
package org.apache.lucene.analysis.en;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link EnglishMinimalStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestEnglishMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new EnglishMinimalStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test some examples from various papers about this technique */
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
checkOneTerm(analyzer, "queries", "query");
|
||||||
|
checkOneTerm(analyzer, "phrases", "phrase");
|
||||||
|
checkOneTerm(analyzer, "corpus", "corpus");
|
||||||
|
checkOneTerm(analyzer, "stress", "stress");
|
||||||
|
checkOneTerm(analyzer, "kings", "king");
|
||||||
|
checkOneTerm(analyzer, "panels", "panel");
|
||||||
|
checkOneTerm(analyzer, "aerodynamics", "aerodynamic");
|
||||||
|
checkOneTerm(analyzer, "congress", "congress");
|
||||||
|
checkOneTerm(analyzer, "serious", "serious");
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,21 +17,22 @@ package org.apache.lucene.analysis.en;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.Reader;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.util.zip.ZipFile;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the PorterStemFilter with Martin Porter's test data.
|
* Test the PorterStemFilter with Martin Porter's test data.
|
||||||
*/
|
*/
|
||||||
|
@ -41,26 +42,16 @@ public class TestPorterStemFilter extends BaseTokenStreamTestCase {
|
||||||
* The output should be the same as the string in output.txt
|
* The output should be the same as the string in output.txt
|
||||||
*/
|
*/
|
||||||
public void testPorterStemFilter() throws Exception {
|
public void testPorterStemFilter() throws Exception {
|
||||||
Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
TokenStream filter = new PorterStemFilter(tokenizer);
|
@Override
|
||||||
ZipFile zipFile = new ZipFile(getDataFile("porterTestData.zip"));
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
InputStream voc = zipFile.getInputStream(zipFile.getEntry("voc.txt"));
|
Reader reader) {
|
||||||
InputStream out = zipFile.getInputStream(zipFile.getEntry("output.txt"));
|
Tokenizer t = new KeywordTokenizer(reader);
|
||||||
BufferedReader vocReader = new BufferedReader(new InputStreamReader(
|
return new TokenStreamComponents(t, new PorterStemFilter(t));
|
||||||
voc, "UTF-8"));
|
}
|
||||||
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
|
};
|
||||||
out, "UTF-8"));
|
|
||||||
String inputWord = null;
|
assertVocabulary(a, getDataFile("porterTestData.zip"), "voc.txt", "output.txt");
|
||||||
while ((inputWord = vocReader.readLine()) != null) {
|
|
||||||
String expectedWord = outputReader.readLine();
|
|
||||||
assertNotNull(expectedWord);
|
|
||||||
tokenizer.reset(new StringReader(inputWord));
|
|
||||||
filter.reset();
|
|
||||||
assertTokenStreamContents(filter, new String[] { expectedWord });
|
|
||||||
}
|
|
||||||
vocReader.close();
|
|
||||||
outputReader.close();
|
|
||||||
zipFile.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testWithKeywordAttribute() throws IOException {
|
public void testWithKeywordAttribute() throws IOException {
|
||||||
|
|
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.es;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link SpanishLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestSpanishLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new SpanishLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("eslighttestdata.zip"), "eslight.txt");
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.fi;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link FinnishLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestFinnishLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new FinnishLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("filighttestdata.zip"), "filight.txt");
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
|
@ -0,0 +1,162 @@
|
||||||
|
package org.apache.lucene.analysis.fr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link FrenchLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestFrenchLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new FrenchLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test some examples from the paper */
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
checkOneTerm(analyzer, "chevaux", "cheval");
|
||||||
|
checkOneTerm(analyzer, "cheval", "cheval");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "hiboux", "hibou");
|
||||||
|
checkOneTerm(analyzer, "hibou", "hibou");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "chantés", "chant");
|
||||||
|
checkOneTerm(analyzer, "chanter", "chant");
|
||||||
|
checkOneTerm(analyzer, "chante", "chant");
|
||||||
|
checkOneTerm(analyzer, "chant", "chant");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "baronnes", "baron");
|
||||||
|
checkOneTerm(analyzer, "barons", "baron");
|
||||||
|
checkOneTerm(analyzer, "baron", "baron");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "peaux", "peau");
|
||||||
|
checkOneTerm(analyzer, "peau", "peau");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "anneaux", "aneau");
|
||||||
|
checkOneTerm(analyzer, "anneau", "aneau");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "neveux", "neveu");
|
||||||
|
checkOneTerm(analyzer, "neveu", "neveu");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "affreux", "afreu");
|
||||||
|
checkOneTerm(analyzer, "affreuse", "afreu");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "investissement", "investi");
|
||||||
|
checkOneTerm(analyzer, "investir", "investi");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "assourdissant", "asourdi");
|
||||||
|
checkOneTerm(analyzer, "assourdir", "asourdi");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "pratiquement", "pratiqu");
|
||||||
|
checkOneTerm(analyzer, "pratique", "pratiqu");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "administrativement", "administratif");
|
||||||
|
checkOneTerm(analyzer, "administratif", "administratif");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "justificatrice", "justifi");
|
||||||
|
checkOneTerm(analyzer, "justificateur", "justifi");
|
||||||
|
checkOneTerm(analyzer, "justifier", "justifi");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "educatrice", "eduqu");
|
||||||
|
checkOneTerm(analyzer, "eduquer", "eduqu");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "communicateur", "comuniqu");
|
||||||
|
checkOneTerm(analyzer, "communiquer", "comuniqu");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "accompagnatrice", "acompagn");
|
||||||
|
checkOneTerm(analyzer, "accompagnateur", "acompagn");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "administrateur", "administr");
|
||||||
|
checkOneTerm(analyzer, "administrer", "administr");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "productrice", "product");
|
||||||
|
checkOneTerm(analyzer, "producteur", "product");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "acheteuse", "achet");
|
||||||
|
checkOneTerm(analyzer, "acheteur", "achet");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "planteur", "plant");
|
||||||
|
checkOneTerm(analyzer, "plante", "plant");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "poreuse", "poreu");
|
||||||
|
checkOneTerm(analyzer, "poreux", "poreu");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "plieuse", "plieu");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "bijoutière", "bijouti");
|
||||||
|
checkOneTerm(analyzer, "bijoutier", "bijouti");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "caissière", "caisi");
|
||||||
|
checkOneTerm(analyzer, "caissier", "caisi");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "abrasive", "abrasif");
|
||||||
|
checkOneTerm(analyzer, "abrasif", "abrasif");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "folle", "fou");
|
||||||
|
checkOneTerm(analyzer, "fou", "fou");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "personnelle", "person");
|
||||||
|
checkOneTerm(analyzer, "personne", "person");
|
||||||
|
|
||||||
|
// algo bug: too short length
|
||||||
|
//checkOneTerm(analyzer, "personnel", "person");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "complète", "complet");
|
||||||
|
checkOneTerm(analyzer, "complet", "complet");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "aromatique", "aromat");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "faiblesse", "faibl");
|
||||||
|
checkOneTerm(analyzer, "faible", "faibl");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "patinage", "patin");
|
||||||
|
checkOneTerm(analyzer, "patin", "patin");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "sonorisation", "sono");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "ritualisation", "rituel");
|
||||||
|
checkOneTerm(analyzer, "rituel", "rituel");
|
||||||
|
|
||||||
|
// algo bug: masked by rules above
|
||||||
|
//checkOneTerm(analyzer, "colonisateur", "colon");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "nomination", "nomin");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "disposition", "dispos");
|
||||||
|
checkOneTerm(analyzer, "dispose", "dispos");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("frlighttestdata.zip"), "frlight.txt");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,62 @@
|
||||||
|
package org.apache.lucene.analysis.fr;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link FrenchMinimalStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestFrenchMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new FrenchMinimalStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test some examples from the paper */
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
checkOneTerm(analyzer, "chevaux", "cheval");
|
||||||
|
checkOneTerm(analyzer, "hiboux", "hibou");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "chantés", "chant");
|
||||||
|
checkOneTerm(analyzer, "chanter", "chant");
|
||||||
|
checkOneTerm(analyzer, "chante", "chant");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "baronnes", "baron");
|
||||||
|
checkOneTerm(analyzer, "barons", "baron");
|
||||||
|
checkOneTerm(analyzer, "baron", "baron");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("frminimaltestdata.zip"), "frminimal.txt");
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.hu;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link HungarianLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestHungarianLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new HungarianLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("hulighttestdata.zip"), "hulight.txt");
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.it;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link ItalianLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestItalianLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new ItalianLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("itlighttestdata.zip"), "itlight.txt");
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
|
@ -0,0 +1,95 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link PortugueseLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestPortugueseLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
|
||||||
|
return new TokenStreamComponents(source, new PortugueseLightStemFilter(result));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the example from the paper "Assessing the impact of stemming accuracy
|
||||||
|
* on information retrieval"
|
||||||
|
*/
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
assertAnalyzesTo(
|
||||||
|
analyzer,
|
||||||
|
"O debate político, pelo menos o que vem a público, parece, de modo nada "
|
||||||
|
+ "surpreendente, restrito a temas menores. Mas há, evidentemente, "
|
||||||
|
+ "grandes questões em jogo nas eleições que se aproximam.",
|
||||||
|
new String[] {
|
||||||
|
"o", "debat", "politic", "pelo", "meno", "o", "que", "vem", "a",
|
||||||
|
"public", "parec", "de", "modo", "nada", "surpreendent", "restrit",
|
||||||
|
"a", "tema", "menor", "mas", "há", "evident", "grand", "questa",
|
||||||
|
"em", "jogo", "nas", "eleica", "que", "se", "aproximam"
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test examples from the c implementation
|
||||||
|
*/
|
||||||
|
public void testMoreExamples() throws IOException {
|
||||||
|
checkOneTerm(analyzer, "doutores", "doutor");
|
||||||
|
checkOneTerm(analyzer, "doutor", "doutor");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "homens", "homem");
|
||||||
|
checkOneTerm(analyzer, "homem", "homem");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "papéis", "papel");
|
||||||
|
checkOneTerm(analyzer, "papel", "papel");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "normais", "normal");
|
||||||
|
checkOneTerm(analyzer, "normal", "normal");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "lencóis", "lencol");
|
||||||
|
checkOneTerm(analyzer, "lencol", "lencol");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "barris", "barril");
|
||||||
|
checkOneTerm(analyzer, "barril", "barril");
|
||||||
|
|
||||||
|
checkOneTerm(analyzer, "botões", "bota");
|
||||||
|
checkOneTerm(analyzer, "botão", "bota");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("ptlighttestdata.zip"), "ptlight.txt");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,69 @@
|
||||||
|
package org.apache.lucene.analysis.pt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link PortugueseMinimalStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestPortugueseMinimalStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
TokenStream result = new LowerCaseFilter(TEST_VERSION_CURRENT, source);
|
||||||
|
return new TokenStreamComponents(source, new PortugueseMinimalStemFilter(result));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test the example from the paper "Assessing the impact of stemming accuracy
|
||||||
|
* on information retrieval"
|
||||||
|
*/
|
||||||
|
public void testExamples() throws IOException {
|
||||||
|
assertAnalyzesTo(
|
||||||
|
analyzer,
|
||||||
|
"O debate político, pelo menos o que vem a público, parece, de modo nada "
|
||||||
|
+ "surpreendente, restrito a temas menores. Mas há, evidentemente, "
|
||||||
|
+ "grandes questões em jogo nas eleições que se aproximam.",
|
||||||
|
new String[] {
|
||||||
|
"o", "debate", "político", "pelo", "menos", "o", "que", "vem", "a",
|
||||||
|
"público", "parece", "de", "modo", "nada", "surpreendente", "restrito",
|
||||||
|
"a", "tema", "menor", "mas", "há", "evidentemente", "grande", "questão",
|
||||||
|
"em", "jogo", "na", "eleição", "que", "se", "aproximam"
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("ptminimaltestdata.zip"), "ptminimal.txt");
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.ru;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link RussianLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestRussianLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new RussianLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("rulighttestdata.zip"), "rulight.txt");
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,71 +17,35 @@ package org.apache.lucene.analysis.ru;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import java.io.BufferedReader;
|
import java.io.IOException;
|
||||||
import java.io.File;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.Reader;
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.util.ArrayList;
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
|
* @deprecated Remove this test class (and its datafiles!) in Lucene 4.0
|
||||||
*/
|
*/
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public class TestRussianStem extends LuceneTestCase
|
public class TestRussianStem extends LuceneTestCase {
|
||||||
{
|
public void testStem() throws IOException {
|
||||||
private ArrayList<String> words = new ArrayList<String>();
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
private ArrayList<String> stems = new ArrayList<String>();
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
public TestRussianStem(String name)
|
Reader reader) {
|
||||||
{
|
Tokenizer t = new KeywordTokenizer(reader);
|
||||||
super(name);
|
return new TokenStreamComponents(t, new RussianStemFilter(t));
|
||||||
}
|
}
|
||||||
|
};
|
||||||
/**
|
InputStream voc = getClass().getResourceAsStream("wordsUTF8.txt");
|
||||||
* @see TestCase#setUp()
|
InputStream out = getClass().getResourceAsStream("stemsUTF8.txt");
|
||||||
*/
|
assertVocabulary(a, voc, out);
|
||||||
@Override
|
voc.close();
|
||||||
protected void setUp() throws Exception {
|
out.close();
|
||||||
super.setUp();
|
}
|
||||||
//System.out.println(new java.util.Date());
|
|
||||||
String str;
|
|
||||||
|
|
||||||
// open and read words into an array list
|
|
||||||
BufferedReader inWords =
|
|
||||||
new BufferedReader(
|
|
||||||
new InputStreamReader(
|
|
||||||
getClass().getResourceAsStream("wordsUTF8.txt"),
|
|
||||||
"UTF-8"));
|
|
||||||
while ((str = inWords.readLine()) != null)
|
|
||||||
{
|
|
||||||
words.add(str);
|
|
||||||
}
|
|
||||||
inWords.close();
|
|
||||||
|
|
||||||
// open and read stems into an array list
|
|
||||||
BufferedReader inStems =
|
|
||||||
new BufferedReader(
|
|
||||||
new InputStreamReader(
|
|
||||||
getClass().getResourceAsStream("stemsUTF8.txt"),
|
|
||||||
"UTF-8"));
|
|
||||||
while ((str = inStems.readLine()) != null)
|
|
||||||
{
|
|
||||||
stems.add(str);
|
|
||||||
}
|
|
||||||
inStems.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testStem()
|
|
||||||
{
|
|
||||||
for (int i = 0; i < words.size(); i++)
|
|
||||||
{
|
|
||||||
//if ( (i % 100) == 0 ) System.err.println(i);
|
|
||||||
String realStem =
|
|
||||||
RussianStemmer.stemWord(
|
|
||||||
words.get(i));
|
|
||||||
assertEquals("unicode", stems.get(i), realStem);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
|
@ -17,38 +17,21 @@ package org.apache.lucene.analysis.snowball;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.Reader;
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.StringReader;
|
|
||||||
import java.util.zip.ZipFile;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test the snowball filters against the snowball data tests
|
* Test the snowball filters against the snowball data tests
|
||||||
*/
|
*/
|
||||||
public class TestSnowballVocab extends BaseTokenStreamTestCase {
|
public class TestSnowballVocab extends LuceneTestCase {
|
||||||
private Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
|
|
||||||
ZipFile zipFile = null;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void setUp() throws Exception {
|
|
||||||
super.setUp();
|
|
||||||
this.zipFile = new ZipFile(getDataFile("TestSnowballVocabData.zip"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void tearDown() throws Exception {
|
|
||||||
this.zipFile.close();
|
|
||||||
this.zipFile = null;
|
|
||||||
super.tearDown();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run all languages against their snowball vocabulary tests.
|
* Run all languages against their snowball vocabulary tests.
|
||||||
*/
|
*/
|
||||||
|
@ -82,25 +65,20 @@ public class TestSnowballVocab extends BaseTokenStreamTestCase {
|
||||||
* For the supplied language, run the stemmer against all strings in voc.txt
|
* For the supplied language, run the stemmer against all strings in voc.txt
|
||||||
* The output should be the same as the string in output.txt
|
* The output should be the same as the string in output.txt
|
||||||
*/
|
*/
|
||||||
private void assertCorrectOutput(String snowballLanguage, String dataDirectory)
|
private void assertCorrectOutput(final String snowballLanguage, String dataDirectory)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage);
|
if (VERBOSE) System.out.println("checking snowball language: " + snowballLanguage);
|
||||||
TokenStream filter = new SnowballFilter(tokenizer, snowballLanguage);
|
|
||||||
InputStream voc = zipFile.getInputStream(zipFile.getEntry(dataDirectory + "/voc.txt"));
|
Analyzer a = new ReusableAnalyzerBase() {
|
||||||
InputStream out = zipFile.getInputStream(zipFile.getEntry(dataDirectory + "/output.txt"));
|
@Override
|
||||||
BufferedReader vocReader = new BufferedReader(new InputStreamReader(
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
voc, "UTF-8"));
|
Reader reader) {
|
||||||
BufferedReader outputReader = new BufferedReader(new InputStreamReader(
|
Tokenizer t = new KeywordTokenizer(reader);
|
||||||
out, "UTF-8"));
|
return new TokenStreamComponents(t, new SnowballFilter(t, snowballLanguage));
|
||||||
String inputWord = null;
|
}
|
||||||
while ((inputWord = vocReader.readLine()) != null) {
|
};
|
||||||
String expectedWord = outputReader.readLine();
|
|
||||||
assertNotNull(expectedWord);
|
assertVocabulary(a, getDataFile("TestSnowballVocabData.zip"),
|
||||||
tokenizer.reset(new StringReader(inputWord));
|
dataDirectory + "/voc.txt", dataDirectory + "/output.txt");
|
||||||
filter.reset();
|
|
||||||
assertTokenStreamContents(filter, new String[] {expectedWord});
|
|
||||||
}
|
|
||||||
vocReader.close();
|
|
||||||
outputReader.close();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,48 @@
|
||||||
|
package org.apache.lucene.analysis.sv;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests for {@link SwedishLightStemFilter}
|
||||||
|
*/
|
||||||
|
public class TestSwedishLightStemFilter extends BaseTokenStreamTestCase {
|
||||||
|
private Analyzer analyzer = new ReusableAnalyzerBase() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
|
Reader reader) {
|
||||||
|
Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
|
||||||
|
return new TokenStreamComponents(source, new SwedishLightStemFilter(source));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Test against a vocabulary from the reference impl */
|
||||||
|
public void testVocabulary() throws IOException {
|
||||||
|
assertVocabulary(analyzer, getDataFile("svlighttestdata.zip"), "svlight.txt");
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
|
@ -0,0 +1,83 @@
|
||||||
|
package org.apache.lucene.analysis.util;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.util.zip.ZipFile;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.junit.Assert;
|
||||||
|
|
||||||
|
/** Utility class for doing vocabulary-based stemming tests */
|
||||||
|
public class VocabularyAssert {
|
||||||
|
/** Run a vocabulary test against two data files. */
|
||||||
|
public static void assertVocabulary(Analyzer a, InputStream voc, InputStream out)
|
||||||
|
throws IOException {
|
||||||
|
BufferedReader vocReader = new BufferedReader(
|
||||||
|
new InputStreamReader(voc, "UTF-8"));
|
||||||
|
BufferedReader outputReader = new BufferedReader(
|
||||||
|
new InputStreamReader(out, "UTF-8"));
|
||||||
|
String inputWord = null;
|
||||||
|
while ((inputWord = vocReader.readLine()) != null) {
|
||||||
|
String expectedWord = outputReader.readLine();
|
||||||
|
Assert.assertNotNull(expectedWord);
|
||||||
|
BaseTokenStreamTestCase.checkOneTermReuse(a, inputWord, expectedWord);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Run a vocabulary test against one file: tab separated. */
|
||||||
|
public static void assertVocabulary(Analyzer a, InputStream vocOut)
|
||||||
|
throws IOException {
|
||||||
|
BufferedReader vocReader = new BufferedReader(
|
||||||
|
new InputStreamReader(vocOut, "UTF-8"));
|
||||||
|
String inputLine = null;
|
||||||
|
while ((inputLine = vocReader.readLine()) != null) {
|
||||||
|
if (inputLine.startsWith("#") || inputLine.trim().length() == 0)
|
||||||
|
continue; /* comment */
|
||||||
|
String words[] = inputLine.split("\t");
|
||||||
|
BaseTokenStreamTestCase.checkOneTermReuse(a, words[0], words[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Run a vocabulary test against two data files inside a zip file */
|
||||||
|
public static void assertVocabulary(Analyzer a, File zipFile, String voc, String out)
|
||||||
|
throws IOException {
|
||||||
|
ZipFile zip = new ZipFile(zipFile);
|
||||||
|
InputStream v = zip.getInputStream(zip.getEntry(voc));
|
||||||
|
InputStream o = zip.getInputStream(zip.getEntry(out));
|
||||||
|
assertVocabulary(a, v, o);
|
||||||
|
v.close();
|
||||||
|
o.close();
|
||||||
|
zip.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Run a vocabulary test against a tab-separated data file inside a zip file */
|
||||||
|
public static void assertVocabulary(Analyzer a, File zipFile, String vocOut)
|
||||||
|
throws IOException {
|
||||||
|
ZipFile zip = new ZipFile(zipFile);
|
||||||
|
InputStream vo = zip.getInputStream(zip.getEntry(vocOut));
|
||||||
|
assertVocabulary(a, vo);
|
||||||
|
vo.close();
|
||||||
|
zip.close();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link EnglishMinimalStemFilter} */
|
||||||
|
public class EnglishMinimalStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new EnglishMinimalStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.fi.FinnishLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link FinnishLightStemFilter} */
|
||||||
|
public class FinnishLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new FinnishLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.fr.FrenchLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link FrenchLightStemFilter} */
|
||||||
|
public class FrenchLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new FrenchLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link FrenchMinimalStemFilter} */
|
||||||
|
public class FrenchMinimalStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new FrenchMinimalStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.de.GermanLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link GermanLightStemFilter} */
|
||||||
|
public class GermanLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new GermanLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.de.GermanMinimalStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link GermanMinimalStemFilter} */
|
||||||
|
public class GermanMinimalStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new GermanMinimalStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.hu.HungarianLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link HungarianLightStemFilter} */
|
||||||
|
public class HungarianLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new HungarianLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.it.ItalianLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link ItalianLightStemFilter} */
|
||||||
|
public class ItalianLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new ItalianLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.pt.PortugueseLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link PortugueseLightStemFilter} */
|
||||||
|
public class PortugueseLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new PortugueseLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.pt.PortugueseMinimalStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link PortugueseMinimalStemFilter} */
|
||||||
|
public class PortugueseMinimalStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new PortugueseMinimalStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.ru.RussianLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link RussianLightStemFilter} */
|
||||||
|
public class RussianLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new RussianLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.es.SpanishLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link SpanishLightStemFilter} */
|
||||||
|
public class SpanishLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new SpanishLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,28 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.sv.SwedishLightStemFilter;
|
||||||
|
|
||||||
|
/** Factory for {@link SwedishLightStemFilter} */
|
||||||
|
public class SwedishLightStemFilterFactory extends BaseTokenFilterFactory {
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new SwedishLightStemFilter(input);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the English minimal stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestEnglishMinimalStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("bricks");
|
||||||
|
EnglishMinimalStemFilterFactory factory = new EnglishMinimalStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "brick" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Finnish light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestFinnishLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("aseistettujen");
|
||||||
|
FinnishLightStemFilterFactory factory = new FinnishLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "aseistet" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the French light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestFrenchLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("administrativement");
|
||||||
|
FrenchLightStemFilterFactory factory = new FrenchLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "administratif" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the French minimal stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestFrenchMinimalStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("chevaux");
|
||||||
|
FrenchMinimalStemFilterFactory factory = new FrenchMinimalStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "cheval" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the German light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestGermanLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("häuser");
|
||||||
|
GermanLightStemFilterFactory factory = new GermanLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "haus" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the German minimal stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestGermanMinimalStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("bilder");
|
||||||
|
GermanMinimalStemFilterFactory factory = new GermanMinimalStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "bild" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Hungarian light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestHungarianLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("házakat");
|
||||||
|
HungarianLightStemFilterFactory factory = new HungarianLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "haz" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Italian light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestItalianLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("ragazzo ragazzi");
|
||||||
|
ItalianLightStemFilterFactory factory = new ItalianLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "ragazz", "ragazz" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Portuguese Light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestPortugueseLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("evidentemente");
|
||||||
|
PortugueseLightStemFilterFactory factory = new PortugueseLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "evident" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Portuguese Minimal stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestPortugueseMinimalStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("questões");
|
||||||
|
PortugueseMinimalStemFilterFactory factory = new PortugueseMinimalStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "questão" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Russian light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestRussianLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("журналы");
|
||||||
|
RussianLightStemFilterFactory factory = new RussianLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "журнал" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Spanish Light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestSpanishLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("sociedades");
|
||||||
|
SpanishLightStemFilterFactory factory = new SpanishLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "sociedad" });
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple tests to ensure the Swedish Light stem factory is working.
|
||||||
|
*/
|
||||||
|
public class TestSwedishLightStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Reader reader = new StringReader("äpplen äpple");
|
||||||
|
SwedishLightStemFilterFactory factory = new SwedishLightStemFilterFactory();
|
||||||
|
TokenStream stream = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, reader));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "äppl", "äppl" });
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue