test case for the German stemmer which also shows its limitations

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150407 13f79535-47bb-0310-9956-ffa450edef68
2004-08-08 10:55:27 +00:00 · 2004-08-08 10:55:27 +00:00 · f24f300492
parent 1268d0deed
commit f24f300492
2 changed files with 126 additions and 0 deletions
--- a/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
+++ b/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
@ -0,0 +1,78 @@
+package org.apache.lucene.analysis.de;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Test the German stemmer. The stemming algorithm is known to work less 
+ * than perfect, as it doesn't use any word lists with exceptions. We 
+ * also check some of the cases where the algorithm is wrong.
+ * 
+ * @author Daniel Naber
+ */
+public class TestGermanStemFilter extends TestCase {
+
+  public void testStemming() {
+    try {
+      // read test cases from external file:
+      File dataDir = new File(System.getProperty("dataDir"));
+      File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
+      FileInputStream fis = new FileInputStream(testFile);
+      InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1");
+      BufferedReader breader = new BufferedReader(isr);
+      while(true) {
+        String line = breader.readLine();
+        if (line == null)
+          break;
+        line = line.trim();
+        if (line.startsWith("#") || line.equals(""))
+          continue;    // ignore comments and empty lines
+        String[] parts = line.split(";");
+        //System.out.println(parts[0] + " -- " + parts[1]);
+        check(parts[0], parts[1]);
+      }
+      breader.close();
+      isr.close();
+      fis.close();
+    } catch (IOException e) {
+       e.printStackTrace();
+       fail();
+    }
+  }
+
+  private void check(final String input, final String expected) throws IOException {
+    StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
+    GermanStemFilter filter = new GermanStemFilter(tokenStream);
+    Token t = filter.next();
+    if (t == null)
+      fail();
+    assertEquals(expected, t.termText());
+    filter.close();
+  }
+
+}
--- a/src/test/org/apache/lucene/analysis/de/data.txt
+++ b/src/test/org/apache/lucene/analysis/de/data.txt
@ -0,0 +1,48 @@
+# German special characters are replaced:
+häufig;haufig
+
+# here the stemmer works okay, it maps related words to the same stem:
+abschließen;abschliess
+abschließender;abschliess
+abschließendes;abschliess
+abschließenden;abschliess
+
+Tisch;tisch
+Tische;tisch
+Tischen;tisch
+
+Haus;hau
+Hauses;hau
+Häuser;hau
+Häusern;hau
+# here's a case where overstemming occurs, i.e. a word is 
+# mapped to the same stem as unrelated words:
+hauen;hau
+
+# here's a case where understemming occurs, i.e. two related words
+# are not mapped to the same stem. This is the case with basically
+# all irregular forms:
+Drama;drama
+Dramen;dram
+
+# TODO: known bug: "ß" at the end of a word isn't replaced:
+Ausmaß;ausmaß
+
+# fake words to test if suffixes are cut off:
+xxxxxe;xxxxx
+xxxxxs;xxxxx
+xxxxxn;xxxxx
+xxxxxt;xxxxx
+xxxxxem;xxxxx
+xxxxxer;xxxxx
+xxxxxnd;xxxxx
+# the suffixes are also removed when combined:
+xxxxxetende;xxxxx
+
+# words that are shorter than four characters are not changed:
+xxe;xxe
+# -em and -er are not removed from words shorter than five characters:
+xxem;xxem
+xxer;xxer
+# -nd is not removed from words shorter than six characters:
+xxxnd;xxxnd