add hepburn test and fix some corner cases

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339753 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-05-17 18:03:13 +00:00
parent 8db469ef01
commit fc84112ece
2 changed files with 142 additions and 5 deletions
lucene/analysis/kuromoji/src
java/org/apache/lucene/analysis/ja/util
test/org/apache/lucene/analysis/ja/util

View File

@ -252,6 +252,8 @@ public class ToStringUtil {
/**
* Romanize katakana with modified hepburn
*/
// TODO: now that this is used by readingsfilter and not just for
// debugging, fix this to really be a scheme that works best with IMEs
public static void getRomanization(Appendable builder, CharSequence s) throws IOException {
final int len = s.length();
for (int i = 0; i < len; i++) {
@ -522,6 +524,9 @@ public class ToStringUtil {
if (ch2 == 'ウ') {
builder.append("");
i++;
} else if (ch2 == 'ゥ') {
builder.append("tu");
i++;
} else {
builder.append("to");
}
@ -665,7 +670,7 @@ public class ToStringUtil {
builder.append("mu");
break;
case 'メ':
builder.append("mi");
builder.append("me");
break;
case 'モ':
if (ch2 == 'ウ') {
@ -690,7 +695,12 @@ public class ToStringUtil {
}
break;
case 'ラ':
if (ch2 == '゜') {
builder.append("la");
i++;
} else {
builder.append("ra");
}
break;
case 'リ':
if (ch2 == 'ョ' && ch3 == 'ウ') {
@ -711,20 +721,36 @@ public class ToStringUtil {
} else if (ch2 == 'ェ') {
builder.append("rye");
i++;
} else if (ch2 == '゜') {
builder.append("li");
i++;
} else {
builder.append("ri");
}
break;
case 'ル':
if (ch2 == '゜') {
builder.append("lu");
i++;
} else {
builder.append("ru");
}
break;
case 'レ':
if (ch2 == '゜') {
builder.append("le");
i++;
} else {
builder.append("re");
}
break;
case 'ロ':
if (ch2 == 'ウ') {
builder.append("");
i++;
} else if (ch2 == '゜') {
builder.append("lo");
i++;
} else {
builder.append("ro");
}
@ -887,7 +913,28 @@ public class ToStringUtil {
builder.append("da");
break;
case 'ヂ':
// TODO: investigate all this
if (ch2 == 'ョ' && ch3 == 'ウ') {
builder.append("");
i += 2;
} else if (ch2 == 'ュ' && ch3 == 'ウ') {
builder.append("");
i += 2;
} else if (ch2 == 'ャ') {
builder.append("ja");
i++;
} else if (ch2 == 'ョ') {
builder.append("jo");
i++;
} else if (ch2 == 'ュ') {
builder.append("ju");
i++;
} else if (ch2 == 'ェ') {
builder.append("je");
i++;
} else {
builder.append("ji");
}
break;
case 'ヅ':
builder.append("zu");
@ -994,6 +1041,18 @@ public class ToStringUtil {
builder.append("po");
}
break;
case 'ヷ':
builder.append("va");
break;
case 'ヸ':
builder.append("vi");
break;
case 'ヹ':
builder.append("ve");
break;
case 'ヺ':
builder.append("vo");
break;
case 'ヴ':
if (ch2 == 'ィ' && ch3 == 'ェ') {
builder.append("vye");

View File

@ -17,6 +17,9 @@ package org.apache.lucene.analysis.ja.util;
* limitations under the License.
*/
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.util.LuceneTestCase;
public class TestToStringUtil extends LuceneTestCase {
@ -31,4 +34,79 @@ public class TestToStringUtil extends LuceneTestCase {
assertEquals("chashu", ToStringUtil.getRomanization("チャーシュー"));
assertEquals("shumai", ToStringUtil.getRomanization("シューマイ"));
}
// see http://en.wikipedia.org/wiki/Hepburn_romanization,
// but this isn't even thorough, and probably isn't really what we want!
/**
 * Checks {@link ToStringUtil#getRomanization(String)} against a table of
 * katakana inputs and their expected romanizations, asserting once per entry.
 * <p>
 * Every key must be a distinct, non-empty katakana sequence: an empty or
 * duplicated key would silently overwrite an earlier entry and drop that
 * case from the test.
 */
public void testHepburnTable() {
  // Plain map instead of double-brace initialization, which creates an
  // anonymous HashMap subclass for no benefit.
  final Map<String,String> table = new HashMap<String,String>();

  // basic gojuon
  table.put("ア", "a");   table.put("イ", "i");   table.put("ウ", "u");   table.put("エ", "e");   table.put("オ", "o");
  table.put("カ", "ka");  table.put("キ", "ki");  table.put("ク", "ku");  table.put("ケ", "ke");  table.put("コ", "ko");
  table.put("サ", "sa");  table.put("シ", "shi"); table.put("ス", "su");  table.put("セ", "se");  table.put("ソ", "so");
  table.put("タ", "ta");  table.put("チ", "chi"); table.put("ツ", "tsu"); table.put("テ", "te");  table.put("ト", "to");
  table.put("ナ", "na");  table.put("ニ", "ni");  table.put("ヌ", "nu");  table.put("ネ", "ne");  table.put("ノ", "no");
  table.put("ハ", "ha");  table.put("ヒ", "hi");  table.put("フ", "fu");  table.put("ヘ", "he");  table.put("ホ", "ho");
  table.put("マ", "ma");  table.put("ミ", "mi");  table.put("ム", "mu");  table.put("メ", "me");  table.put("モ", "mo");
  table.put("ヤ", "ya");  table.put("ユ", "yu");  table.put("ヨ", "yo");
  table.put("ラ", "ra");  table.put("リ", "ri");  table.put("ル", "ru");  table.put("レ", "re");  table.put("ロ", "ro");
  table.put("ワ", "wa");  table.put("ヰ", "i");   table.put("ヱ", "e");   table.put("ヲ", "o");
  table.put("ン", "n");

  // voiced (dakuten) and semi-voiced (handakuten) rows
  table.put("ガ", "ga");  table.put("ギ", "gi");  table.put("グ", "gu");  table.put("ゲ", "ge");  table.put("ゴ", "go");
  table.put("ザ", "za");  table.put("ジ", "ji");  table.put("ズ", "zu");  table.put("ゼ", "ze");  table.put("ゾ", "zo");
  table.put("ダ", "da");  table.put("ヂ", "ji");  table.put("ヅ", "zu");  table.put("デ", "de");  table.put("ド", "do");
  table.put("バ", "ba");  table.put("ビ", "bi");  table.put("ブ", "bu");  table.put("ベ", "be");  table.put("ボ", "bo");
  table.put("パ", "pa");  table.put("ピ", "pi");  table.put("プ", "pu");  table.put("ペ", "pe");  table.put("ポ", "po");

  // palatalized digraphs
  table.put("キャ", "kya"); table.put("キュ", "kyu"); table.put("キョ", "kyo");
  table.put("シャ", "sha"); table.put("シュ", "shu"); table.put("ショ", "sho");
  table.put("チャ", "cha"); table.put("チュ", "chu"); table.put("チョ", "cho");
  table.put("ニャ", "nya"); table.put("ニュ", "nyu"); table.put("ニョ", "nyo");
  table.put("ヒャ", "hya"); table.put("ヒュ", "hyu"); table.put("ヒョ", "hyo");
  table.put("ミャ", "mya"); table.put("ミュ", "myu"); table.put("ミョ", "myo");
  table.put("リャ", "rya"); table.put("リュ", "ryu"); table.put("リョ", "ryo");
  table.put("ギャ", "gya"); table.put("ギュ", "gyu"); table.put("ギョ", "gyo");
  table.put("ジャ", "ja");  table.put("ジュ", "ju");  table.put("ジョ", "jo");
  table.put("ヂャ", "ja");  table.put("ヂュ", "ju");  table.put("ヂョ", "jo");
  table.put("ビャ", "bya"); table.put("ビュ", "byu"); table.put("ビョ", "byo");
  table.put("ピャ", "pya"); table.put("ピュ", "pyu"); table.put("ピョ", "pyo");

  // extended katakana combinations
  table.put("イィ", "yi"); table.put("イェ", "ye");
  table.put("ウァ", "wa"); table.put("ウィ", "wi"); table.put("ウゥ", "wu"); table.put("ウェ", "we"); table.put("ウォ", "wo");
  table.put("ウュ", "wyu");
  // TODO: really should be vu
  table.put("ヴァ", "va"); table.put("ヴィ", "vi"); table.put("ヴ", "v"); table.put("ヴェ", "ve"); table.put("ヴォ", "vo");
  table.put("ヴャ", "vya"); table.put("ヴュ", "vyu"); table.put("ヴィェ", "vye"); table.put("ヴョ", "vyo");
  table.put("キェ", "kye");
  table.put("ギェ", "gye");
  table.put("クァ", "kwa"); table.put("クィ", "kwi"); table.put("クェ", "kwe"); table.put("クォ", "kwo");
  table.put("クヮ", "kwa");
  table.put("グァ", "gwa"); table.put("グィ", "gwi"); table.put("グェ", "gwe"); table.put("グォ", "gwo");
  table.put("グヮ", "gwa");
  table.put("シェ", "she");
  table.put("ジェ", "je");
  table.put("スィ", "si");
  table.put("ズィ", "zi");
  table.put("チェ", "che");
  table.put("ツァ", "tsa"); table.put("ツィ", "tsi"); table.put("ツェ", "tse"); table.put("ツォ", "tso");
  table.put("ツュ", "tsyu");
  table.put("ティ", "ti"); table.put("トゥ", "tu");
  table.put("テュ", "tyu");
  table.put("ディ", "di"); table.put("ドゥ", "du");
  table.put("デュ", "dyu");
  table.put("ニェ", "nye");
  table.put("ヒェ", "hye");
  table.put("ビェ", "bye");
  table.put("ピェ", "pye");
  table.put("ファ", "fa"); table.put("フィ", "fi"); table.put("フェ", "fe"); table.put("フォ", "fo");
  table.put("フャ", "fya"); table.put("フュ", "fyu"); table.put("フィェ", "fye"); table.put("フョ", "fyo");
  table.put("ホゥ", "hu");
  table.put("ミェ", "mye");
  table.put("リェ", "rye");

  // r-row kana followed by a standalone handakuten (U+309C) romanize as l-
  table.put("ラ゜", "la"); table.put("リ゜", "li"); table.put("ル゜", "lu"); table.put("レ゜", "le"); table.put("ロ゜", "lo");

  // precomposed voiced w-row kana (U+30F7..U+30FA)
  table.put("ヷ", "va"); table.put("ヸ", "vi"); table.put("ヹ", "ve"); table.put("ヺ", "vo");

  // The katakana input doubles as the assertion message so a failure names the offending entry.
  for (Map.Entry<String,String> entry : table.entrySet()) {
    final String katakana = entry.getKey();
    assertEquals(katakana, entry.getValue(), ToStringUtil.getRomanization(katakana));
  }
}
}