LUCENE-3700: optionally support naist-jdic for kuromoji

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232268 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-17 02:20:24 +00:00
parent 48c01e5a2b
commit f562a8a0dc
2 changed files with 19 additions and 6 deletions

kuromoji build.xml

@@ -25,10 +25,18 @@
<property name="build.dir" location="../build/kuromoji" />
<property name="dist.dir" location="../dist/kuromoji" />
<!-- default configuration: uses mecab-ipadic -->
<property name="ipadic.version" value="mecab-ipadic-2.7.0-20070801" />
<property name="dict.src.file" value="${ipadic.version}.tar.gz" />
<!-- <property name="dict.url" value="http://atilika.com/releases/mecab-ipadic/${dict.src.file}" /> -->
<property name="dict.url" value="http://mecab.googlecode.com/files/${dict.src.file}"/>
<!-- alternative configuration: uses mecab-naist-jdic
<property name="ipadic.version" value="mecab-naist-jdic-0.6.3b-20111013" />
<property name="dict.src.file" value="${ipadic.version}.tar.gz" />
<property name="dict.url" value="http://sourceforge.jp/frs/redir.php?m=iij&amp;f=/naist-jdic/53500/${dict.src.file}"/>
-->
<property name="dict.src.dir" value="${build.dir}/${ipadic.version}" />
<property name="dict.encoding" value="euc-jp"/>
<property name="dict.format" value="ipadic"/>

TestKuromojiTokenizer.java

@@ -78,12 +78,17 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
);
}
/* Note this is really a stupid test, just to see if things aren't horribly slow.
* Ideally the test would actually fail instead of hanging...
*/
public void testDecomposition5() throws Exception {
assertAnalyzesTo(analyzer, "くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ",
new String[] { "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ", "くよくよ" },
new int[] { 0, 4, 8, 12, 16, 20, 24, 28, 32, 36},
new int[] { 4, 8, 12, 16, 20, 24, 28, 32, 36, 40 }
);
// Consume the same input again without assertions: the test only checks that
// tokenization terminates in reasonable time rather than hanging.
TokenStream ts = analyzer.tokenStream("bogus", new StringReader("くよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよくよ"));
ts.reset();
while (ts.incrementToken()) {
}
ts.end();
ts.close();
}
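The bare incrementToken() loop above only drives the tokenizer; nothing reads the produced terms. For reference, a minimal sketch of the usual consumption pattern, reading term text through Lucene's CharTermAttribute (the field name and input string are arbitrary here):

// Requires org.apache.lucene.analysis.tokenattributes.CharTermAttribute
// and java.io.StringReader in addition to the imports above.
TokenStream ts = analyzer.tokenStream("bogus", new StringReader("くよくよくよくよ"));
// Attributes are obtained from the stream before consuming it; the contract
// is reset(), incrementToken() until it returns false, then end() and close().
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(termAtt.toString());
}
ts.end();
ts.close();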
/** Tests that sentence offset is incorporated into the resulting offsets */