mirror of https://github.com/apache/lucene.git
LUCENE-1692: add tests for Thai & SmartChinese analyzers; fix wrong endOffset bug in ThaiWordFilter; use stop words by default with SmartChineseAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@786560 13f79535-47bb-0310-9956-ffa450edef68
commit 2f2cd20828
parent fc243f12ce
@@ -58,7 +58,7 @@ public class SmartChineseAnalyzer extends Analyzer {
   private WordSegmenter wordSegment;
 
   public SmartChineseAnalyzer() {
-    this(false);
+    this(true);
   }
 
   /**
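Usage note: with the hunk above, the no-arg constructor now delegates to this(true), so the bundled stop-word list is applied unless a caller explicitly opts out. A minimal sketch of the resulting usage; the smartcn package path is assumed, and the class and variable names are illustrative only:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; // package path assumed

    public class StopWordDefaultSketch {
      public static void main(String[] args) {
        // After this commit the default constructor enables stop-word filtering.
        Analyzer withDefaults = new SmartChineseAnalyzer();      // same as new SmartChineseAnalyzer(true)
        Analyzer noStopWords  = new SmartChineseAnalyzer(false); // stop words (and punctuation) are kept
      }
    }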
(Two file diffs suppressed because they are too large.)
@@ -47,7 +47,7 @@ public class ThaiWordFilter extends TokenFilter {
     if (end != BreakIterator.DONE) {
       reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
       reusableToken.setStartOffset(thaiToken.startOffset()+start);
-      reusableToken.setEndOffset(thaiToken.endOffset()+end);
+      reusableToken.setEndOffset(thaiToken.startOffset()+end);
       return reusableToken;
     }
     thaiToken = null;
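Why the fix uses startOffset(): end is a break position inside the original token's term text, so the sub-token's end offset must be measured from where that token begins in the input, not from where it ends. A minimal arithmetic sketch with made-up values (class name illustrative only):

    public class ThaiOffsetSketch {
      public static void main(String[] args) {
        // Suppose the incoming Thai token spans input offsets 10..15 and the
        // BreakIterator reports a word break at position end = 3 inside it.
        int tokenStartOffset = 10, tokenEndOffset = 15, end = 3;

        int buggy   = tokenEndOffset + end;   // 18 -- points past the token (the old behavior)
        int correct = tokenStartOffset + end; // 13 -- inside the token, matching the fix above
        System.out.println(buggy + " vs " + correct);
      }
    }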
@@ -31,7 +31,27 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
 public class TestSmartChineseAnalyzer extends TestCase {
 
+  public void testChineseStopWordsDefault() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(); /* will load stop words */
+    String sentence = "我购买了道具和服装。";
+    String result[] = { "我", "购买", "了", "道具", "和", "服装" };
+    assertAnalyzesTo(ca, sentence, result);
+  }
+
+  /*
+   * Punctuation is handled in a strange way if you disable stop words:
+   * in this example the IDEOGRAPHIC FULL STOP is converted into a comma.
+   * If you don't supply true to the constructor, or you use a different stop-word list,
+   * then punctuation is indexed.
+   */
+  public void testChineseStopWordsOff() throws Exception {
+    Analyzer ca = new SmartChineseAnalyzer(false); /* does not load stop words */
+    String sentence = "我购买了道具和服装。";
+    String result[] = { "我", "购买", "了", "道具", "和", "服装", "," };
+    assertAnalyzesTo(ca, sentence, result);
+  }
+
   public void testChineseAnalyzer() throws IOException {
     Token nt = new Token();
     Analyzer ca = new SmartChineseAnalyzer(true);
@@ -47,6 +67,54 @@ public class TestSmartChineseAnalyzer extends TestCase {
     }
     ts.close();
   }
 
+  /*
+   * English words are lowercased and porter-stemmed.
+   */
+  public void testMixedLatinChinese() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买 Tests 了道具和服装",
+        new String[] { "我", "购买", "test", "了", "道具", "和", "服装" });
+  }
+
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(new SmartChineseAnalyzer(true), "我购买了道具和服装",
+        new String[] { "我", "购买", "了", "道具", "和", "服装" },
+        new int[] { 0, 1, 3, 4, 6, 7 },
+        new int[] { 1, 3, 4, 6, 7, 9 });
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
+      throws Exception {
+
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    final Token reusableToken = new Token();
+    for (int i = 0; i < output.length; i++) {
+      Token nextToken = ts.next(reusableToken);
+      assertNotNull(nextToken);
+      assertEquals(nextToken.term(), output[i]);
+      if (startOffsets != null)
+        assertEquals(nextToken.startOffset(), startOffsets[i]);
+      if (endOffsets != null)
+        assertEquals(nextToken.endOffset(), endOffsets[i]);
+      if (types != null)
+        assertEquals(nextToken.type(), types[i]);
+    }
+    assertNull(ts.next(reusableToken));
+    ts.close();
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, types);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+  }
+
 
   /**
    * @param args
@@ -30,8 +30,43 @@ import org.apache.lucene.analysis.TokenStream;
  */
 
 public class TestThaiAnalyzer extends TestCase {
 
-  public void assertAnalyzesTo(Analyzer a, String input, String[] output)
+  /*
+   * Test case for offsets.
+   */
+  public void testOffsets() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์" },
+        new int[] { 0, 2, 7, 9, 12 },
+        new int[] { 2, 7, 9, 12, 17 });
+  }
+
+  /*
+   * Thai numeric tokens are typed as <ALPHANUM> instead of <NUM>.
+   * This is really a problem with the interaction with StandardTokenizer, which is used by ThaiAnalyzer.
+   *
+   * The issue is this: in StandardTokenizer the entire [:Thai:] block is specified as ALPHANUM (including punctuation, digits, etc.).
+   * The easy fix is to refine that spec to exclude Thai punctuation and digits.
+   *
+   * A better fix, which would also help quite a few other languages, would be to remove the Thai hack entirely
+   * and instead allow the definition of ALPHANUM to include relevant categories such as nonspacing marks.
+   */
+  public void testBuggyTokenType() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
+  }
+
+  /* correct test case, disabled until the tokenizer is fixed:
+  public void testTokenType() throws Exception {
+    assertAnalyzesTo(new ThaiAnalyzer(), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+        new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
+        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<NUM>" });
+  }
+  */
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
     throws Exception {
 
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
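A side note on the token-type comment in the hunk above: Thai letters and Thai digits already carry different Unicode general categories, which is what a refined tokenizer spec could key on. A minimal sketch using the standard Character API (class name illustrative only):

    public class ThaiCategorySketch {
      public static void main(String[] args) {
        // THAI CHARACTER THO THAHAN (ท) is a letter; THAI DIGIT ONE (๑) is a decimal digit.
        System.out.println(Character.getType('ท') == Character.OTHER_LETTER);         // true
        System.out.println(Character.getType('๑') == Character.DECIMAL_DIGIT_NUMBER); // true
      }
    }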
@@ -40,10 +75,28 @@ public class TestThaiAnalyzer extends TestCase {
       Token nextToken = ts.next(reusableToken);
       assertNotNull(nextToken);
       assertEquals(nextToken.term(), output[i]);
+      if (startOffsets != null)
+        assertEquals(nextToken.startOffset(), startOffsets[i]);
+      if (endOffsets != null)
+        assertEquals(nextToken.endOffset(), endOffsets[i]);
+      if (types != null)
+        assertEquals(nextToken.type(), types[i]);
     }
     assertNull(ts.next(reusableToken));
     ts.close();
   }
 
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws Exception {
+    assertAnalyzesTo(a, input, output, null, null, types);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[]) throws Exception {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null);
+  }
+
   public void testAnalyzer() throws Exception {
     ThaiAnalyzer analyzer = new ThaiAnalyzer();