From fc84112ece8e7306083283c7d3b099e97bdbcbfb Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 17 May 2012 18:03:13 +0000 Subject: [PATCH] add hepburn test and fix some corner cases git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1339753 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/ja/util/ToStringUtil.java | 69 ++++++++++++++-- .../analysis/ja/util/TestToStringUtil.java | 78 +++++++++++++++++++ 2 files changed, 142 insertions(+), 5 deletions(-) diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ToStringUtil.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ToStringUtil.java index c83de194d7a..977ab49f0e8 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ToStringUtil.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/util/ToStringUtil.java @@ -252,6 +252,8 @@ public class ToStringUtil { /** * Romanize katakana with modified hepburn */ + // TODO: now that this is used by readingsfilter and not just for + // debugging, fix this to really be a scheme that works best with IMEs public static void getRomanization(Appendable builder, CharSequence s) throws IOException { final int len = s.length(); for (int i = 0; i < len; i++) { @@ -522,6 +524,9 @@ public class ToStringUtil { if (ch2 == 'ウ') { builder.append("tō"); i++; + } else if (ch2 == 'ゥ') { + builder.append("tu"); + i++; } else { builder.append("to"); } @@ -665,7 +670,7 @@ public class ToStringUtil { builder.append("mu"); break; case 'メ': - builder.append("mi"); + builder.append("me"); break; case 'モ': if (ch2 == 'ウ') { @@ -690,7 +695,12 @@ public class ToStringUtil { } break; case 'ラ': - builder.append("ra"); + if (ch2 == '゜') { + builder.append("la"); + i++; + } else { + builder.append("ra"); + } break; case 'リ': if (ch2 == 'ョ' && ch3 == 'ウ') { @@ -711,20 +721,36 @@ public class ToStringUtil { } else if (ch2 == 'ェ') { builder.append("rye"); i++; + } else if (ch2 == '゜') { + builder.append("li"); + i++; } else { builder.append("ri"); } break; case 'ル': - builder.append("ru"); + if (ch2 == '゜') { + builder.append("lu"); + i++; + } else { + builder.append("ru"); + } break; case 'レ': - builder.append("re"); + if (ch2 == '゜') { + builder.append("le"); + i++; + } else { + builder.append("re"); + } break; case 'ロ': if (ch2 == 'ウ') { builder.append("rō"); i++; + } else if (ch2 == '゜') { + builder.append("lo"); + i++; } else { builder.append("ro"); } @@ -887,7 +913,28 @@ public class ToStringUtil { builder.append("da"); break; case 'ヂ': - builder.append("ji"); + // TODO: investigate all this + if (ch2 == 'ョ' && ch3 == 'ウ') { + builder.append("jō"); + i += 2; + } else if (ch2 == 'ュ' && ch3 == 'ウ') { + builder.append("jū"); + i += 2; + } else if (ch2 == 'ャ') { + builder.append("ja"); + i++; + } else if (ch2 == 'ョ') { + builder.append("jo"); + i++; + } else if (ch2 == 'ュ') { + builder.append("ju"); + i++; + } else if (ch2 == 'ェ') { + builder.append("je"); + i++; + } else { + builder.append("ji"); + } break; case 'ヅ': builder.append("zu"); @@ -994,6 +1041,18 @@ public class ToStringUtil { builder.append("po"); } break; + case 'ヷ': + builder.append("va"); + break; + case 'ヸ': + builder.append("vi"); + break; + case 'ヹ': + builder.append("ve"); + break; + case 'ヺ': + builder.append("vo"); + break; case 'ヴ': if (ch2 == 'ィ' && ch3 == 'ェ') { builder.append("vye"); diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/util/TestToStringUtil.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/util/TestToStringUtil.java index f95a527dcb9..a2388d7c03c 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/util/TestToStringUtil.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/util/TestToStringUtil.java @@ -17,6 +17,9 @@ package org.apache.lucene.analysis.ja.util; * limitations under the License. */ +import java.util.HashMap; +import java.util.Map; + import org.apache.lucene.util.LuceneTestCase; public class TestToStringUtil extends LuceneTestCase { @@ -31,4 +34,79 @@ public class TestToStringUtil extends LuceneTestCase { assertEquals("chashu", ToStringUtil.getRomanization("チャーシュー")); assertEquals("shumai", ToStringUtil.getRomanization("シューマイ")); } + + // see http://en.wikipedia.org/wiki/Hepburn_romanization, + // but this isnt even thorough or really probably what we want! + public void testHepburnTable() { + Map table = new HashMap() {{ + put("ア", "a"); put("イ", "i"); put("ウ", "u"); put("エ", "e"); put("オ", "o"); + put("カ", "ka"); put("キ", "ki"); put("ク", "ku"); put("ケ", "ke"); put("コ", "ko"); + put("サ", "sa"); put("シ", "shi"); put("ス", "su"); put("セ", "se"); put("ソ", "so"); + put("タ", "ta"); put("チ", "chi"); put("ツ", "tsu"); put("テ", "te"); put("ト", "to"); + put("ナ", "na"); put("ニ", "ni"); put("ヌ", "nu"); put("ネ", "ne"); put("ノ", "no"); + put("ハ", "ha"); put("ヒ", "hi"); put("フ", "fu"); put("ヘ", "he"); put("ホ", "ho"); + put("マ", "ma"); put("ミ", "mi"); put("ム", "mu"); put("メ", "me"); put("モ", "mo"); + put("ヤ", "ya"); put("ユ", "yu"); put("ヨ", "yo"); + put("ラ", "ra"); put("リ", "ri"); put("ル", "ru"); put("レ", "re"); put("ロ", "ro"); + put("ワ", "wa"); put("ヰ", "i"); put("ヱ", "e"); put("ヲ", "o"); + put("ン", "n"); + put("ガ", "ga"); put("ギ", "gi"); put("グ", "gu"); put("ゲ", "ge"); put("ゴ", "go"); + put("ザ", "za"); put("ジ", "ji"); put("ズ", "zu"); put("ゼ", "ze"); put("ゾ", "zo"); + put("ダ", "da"); put("ヂ", "ji"); put("ヅ", "zu"); put("デ", "de"); put("ド", "do"); + put("バ", "ba"); put("ビ", "bi"); put("ブ", "bu"); put("ベ", "be"); put("ボ", "bo"); + put("パ", "pa"); put("ピ", "pi"); put("プ", "pu"); put("ペ", "pe"); put("ポ", "po"); + + put("キャ", "kya"); put("キュ", "kyu"); put("キョ", "kyo"); + put("シャ", "sha"); put("シュ", "shu"); put("ショ", "sho"); + put("チャ", "cha"); put("チュ", "chu"); put("チョ", "cho"); + put("ニャ", "nya"); put("ニュ", "nyu"); put("ニョ", "nyo"); + put("ヒャ", "hya"); put("ヒュ", "hyu"); put("ヒョ", "hyo"); + put("ミャ", "mya"); put("ミュ", "myu"); put("ミョ", "myo"); + put("リャ", "rya"); put("リュ", "ryu"); put("リョ", "ryo"); + put("ギャ", "gya"); put("ギュ", "gyu"); put("ギョ", "gyo"); + put("ジャ", "ja"); put("ジュ", "ju"); put("ジョ", "jo"); + put("ヂャ", "ja"); put("ヂュ", "ju"); put("ヂョ", "jo"); + put("ビャ", "bya"); put("ビュ", "byu"); put("ビョ", "byo"); + put("ピャ", "pya"); put("ピュ", "pyu"); put("ピョ", "pyo"); + + put("イィ", "yi"); put("イェ", "ye"); + put("ウァ", "wa"); put("ウィ", "wi"); put("ウゥ", "wu"); put("ウェ", "we"); put("ウォ", "wo"); + put("ウュ", "wyu"); + // TODO: really should be vu + put("ヴァ", "va"); put("ヴィ", "vi"); put("ヴ", "v"); put("ヴェ", "ve"); put("ヴォ", "vo"); + put("ヴャ", "vya"); put("ヴュ", "vyu"); put("ヴィェ", "vye"); put("ヴョ", "vyo"); + put("キェ", "kye"); + put("ギェ", "gye"); + put("クァ", "kwa"); put("クィ", "kwi"); put("クェ", "kwe"); put("クォ", "kwo"); + put("クヮ", "kwa"); + put("グァ", "gwa"); put("グィ", "gwi"); put("グェ", "gwe"); put("グォ", "gwo"); + put("グヮ", "gwa"); + put("シェ", "she"); + put("ジェ", "je"); + put("スィ", "si"); + put("ズィ", "zi"); + put("チェ", "che"); + put("ツァ", "tsa"); put("ツィ", "tsi"); put("ツェ", "tse"); put("ツォ", "tso"); + put("ツュ", "tsyu"); + put("ティ", "ti"); put("トゥ", "tu"); + put("テュ", "tyu"); + put("ディ", "di"); put("ドゥ", "du"); + put("デュ", "dyu"); + put("ニェ", "nye"); + put("ヒェ", "hye"); + put("ビェ", "bye"); + put("ピェ", "pye"); + put("ファ", "fa"); put("フィ", "fi"); put("フェ", "fe"); put("フォ", "fo"); + put("フャ", "fya"); put("フュ", "fyu"); put("フィェ", "fye"); put("フョ", "fyo"); + put("ホゥ", "hu"); + put("ミェ", "mye"); + put("リェ", "rye"); + put("ラ゜", "la"); put("リ゜", "li"); put("ル゜", "lu"); put("レ゜", "le"); put("ロ゜", "lo"); + put("ヷ", "va"); put("ヸ", "vi"); put("ヹ", "ve"); put("ヺ", "vo"); + }}; + + for (String s : table.keySet()) { + assertEquals(s, table.get(s), ToStringUtil.getRomanization(s)); + } + } }