diff --git a/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java b/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
new file mode 100644
index 00000000000..91a31fc273f
--- /dev/null
+++ b/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@@ -0,0 +1,78 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Test the German stemmer. The stemming algorithm is known to work less
+ * than perfectly, as it doesn't use any word lists with exceptions. We
+ * also check some of the cases where the algorithm is wrong.
+ *
+ * @author Daniel Naber
+ */
+public class TestGermanStemFilter extends TestCase {
+
+  public void testStemming() {
+    try {
+      // read test cases from external file:
+      File dataDir = new File(System.getProperty("dataDir"));
+      File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
+      FileInputStream fis = new FileInputStream(testFile);
+      InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1");
+      BufferedReader breader = new BufferedReader(isr);
+      while(true) {
+        String line = breader.readLine();
+        if (line == null)
+          break;
+        line = line.trim();
+        if (line.startsWith("#") || line.equals(""))
+          continue;    // ignore comments and empty lines
+        String[] parts = line.split(";");
+        //System.out.println(parts[0] + " -- " + parts[1]);
+        check(parts[0], parts[1]);
+      }
+      breader.close();
+      isr.close();
+      fis.close();
+    } catch (IOException e) {
+      e.printStackTrace();
+      fail();
+    }
+  }
+
+  private void check(final String input, final String expected) throws IOException {
+    StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
+    GermanStemFilter filter = new GermanStemFilter(tokenStream);
+    Token t = filter.next();
+    if (t == null)
+      fail();
+    assertEquals(expected, t.termText());
+    filter.close();
+  }
+
+}
diff --git a/src/test/org/apache/lucene/analysis/de/data.txt b/src/test/org/apache/lucene/analysis/de/data.txt
new file mode 100644
index 00000000000..6458de4c4c9
--- /dev/null
+++ b/src/test/org/apache/lucene/analysis/de/data.txt
@@ -0,0 +1,48 @@
+# German special characters are replaced:
+häufig;haufig
+
+# here the stemmer works okay, it maps related words to the same stem:
+abschließen;abschliess
+abschließender;abschliess
+abschließendes;abschliess
+abschließenden;abschliess
+
+Tisch;tisch
+Tische;tisch
+Tischen;tisch
+
+Haus;hau
+Hauses;hau
+Häuser;hau
+Häusern;hau
+# here's a case where overstemming occurs, i.e. a word is
+# mapped to the same stem as unrelated words:
+hauen;hau
+
+# here's a case where understemming occurs, i.e. two related words
+# are not mapped to the same stem. This is the case with basically
+# all irregular forms:
+Drama;drama
+Dramen;dram
+
+# TODO: known bug: "ß" at the end of a word isn't replaced:
+Ausmaß;ausmaß
+
+# fake words to test if suffixes are cut off:
+xxxxxe;xxxxx
+xxxxxs;xxxxx
+xxxxxn;xxxxx
+xxxxxt;xxxxx
+xxxxxem;xxxxx
+xxxxxer;xxxxx
+xxxxxnd;xxxxx
+# the suffixes are also removed when combined:
+xxxxxetende;xxxxx
+
+# words that are shorter than four characters are not changed:
+xxe;xxe
+# -em and -er are not removed from words shorter than five characters:
+xxem;xxem
+xxer;xxer
+# -nd is not removed from words shorter than six characters:
+xxxnd;xxxnd
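
The fake-word entries at the end of data.txt pin down the stemmer's length thresholds: words shorter than four characters are left untouched, -em and -er are only removed from words of at least five characters, and -nd only from words of at least six. The following is a minimal sketch of just those thresholds, not the actual org.apache.lucene.analysis.de.GermanStemmer code (whose internals may differ); the class and method names are made up for illustration.

// Illustrative sketch only -- not the real GermanStemmer implementation.
// It mirrors the length rules documented by the fake-word cases in data.txt:
// words shorter than four characters are left alone, -em/-er need at least
// five characters, -nd needs at least six, and stripping repeats so that
// combined suffixes like the one in "xxxxxetende" fall away step by step.
final class SuffixStripSketch {

  static String strip(String term) {
    StringBuilder buf = new StringBuilder(term);
    boolean doMore = true;
    while (doMore && buf.length() > 3) {          // words of <= 3 chars stay unchanged
      int len = buf.length();
      if (len > 5 && buf.substring(len - 2).equals("nd")) {
        buf.delete(len - 2, len);                 // -nd only from words of >= 6 chars
      } else if (len > 4 && (buf.substring(len - 2).equals("em")
          || buf.substring(len - 2).equals("er"))) {
        buf.delete(len - 2, len);                 // -em/-er only from words of >= 5 chars
      } else if ("enst".indexOf(buf.charAt(len - 1)) >= 0) {
        buf.deleteCharAt(len - 1);                // single-letter suffixes -e, -n, -s, -t
      } else {
        doMore = false;                           // no rule matched, stop stripping
      }
    }
    return buf.toString();
  }

  public static void main(String[] args) {
    System.out.println(strip("xxxxxetende")); // xxxxx (combined suffixes removed in steps)
    System.out.println(strip("xxxnd"));       // xxxnd (too short for -nd removal)
    System.out.println(strip("xxem"));        // xxem  (too short for -em removal)
  }
}

Because the loop repeats until no rule applies, the combined-suffix case above ("the suffixes are also removed when combined") is covered by the same rules as the single-suffix cases.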