LUCENE-3700: optionally support naist-jdic for kuromoji

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232268 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-17 02:20:24 +00:00
parent 48c01e5a2b
commit f562a8a0dc
2 changed files with 19 additions and 6 deletions

kuromoji build.xml

@@ -25,10 +25,18 @@
<property name="build.dir" location="../build/kuromoji" />
<property name="dist.dir" location="../dist/kuromoji" />
<!-- default configuration: uses mecab-ipadic -->
<property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
<property name="dict.src.file" value="${ipadic.version}.tar.gz" />
<!-- <property name="dict.url" value="http://atilika.com/releases/mecab-ipadic/${dict.src.file}" /> -->
<property name="dict.url" value="http://mecab.googlecode.com/files/${dict.src.file}"/>
<!-- alternative configuration: uses mecab-naist-jdic
<property name="ipadic.version" value="mecab-naist-jdic-0.6.3b-20111013" />
<property name="dict.src.file" value="${ipadic.version}.tar.gz" />
<property name="dict.url" value="http://sourceforge.jp/frs/redir.php?m=iij&amp;f=/naist-jdic/53500/${dict.src.file}"/>
-->
<property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
<property name="dict.encoding" value="euc-jp"/>
<property name="dict.format" value="ipadic"/>

TestKuromojiTokenizer.java

@@ -78,12 +78,17 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
);
}
/* Note this is really a stupid test, just to see if things aren't horribly slow.
* Ideally the test would actually fail instead of hanging...
*/
public void testDecomposition5() throws Exception {
assertAnalyzesTo(analyzer, "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ",
new String[] { "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ" },
new int[] { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36},
new int[] { 4, 8, 12, 16, 20, 24, 28, 32, 36, 40 }
);
// Consume the same input again without assertions: the test only checks that
// tokenization terminates in reasonable time rather than hanging.
TokenStream ts = analyzer.tokenStream("bogus", new StringReader("くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ"));
ts.reset();
while (ts.incrementToken()) {
}
ts.end();
ts.close();
}
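The bare incrementToken() loop above only drives the tokenizer; nothing reads the produced terms. For reference, a minimal sketch of the usual consumption pattern, reading term text through Lucene's CharTermAttribute (the field name and input string are arbitrary here):

// Requires org.apache.lucene.analysis.tokenattributes.CharTermAttribute
// and java.io.StringReader in addition to the imports above.
TokenStream ts = analyzer.tokenStream("bogus", new StringReader("くよくよくよくよ"));
// Attributes are obtained from the stream before consuming it; the contract
// is reset(), incrementToken() until it returns false, then end() and close().
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(termAtt.toString());
}
ts.end();
ts.close();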
/** Tests that sentence offset is incorporated into the resulting offsets */