discourse/lib/tiny_japanese_segmenter.rb

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1492 lines
30 KiB
Ruby
Raw Permalink Normal View History

# frozen_string_literal: true
# Ruby port of http://chasen.org/~taku/software/TinySegmenter/tiny_segmenter-0.2.js
# This is essentially a trained machine learning model used to segment words in Japanese.
# Discourse core uses it for "best effort" segmentation of Japanese text for search.
class TinyJapaneseSegmenter
# Ordered [Regexp, code] pairs used to classify a single character. `ctype`
# returns the code of the first pattern that matches, so the kanji-numeral
# class ("M") must stay ahead of the general kanji class ("H") that contains it.
#
# Codes: M = kanji numerals, H = kanji, I = hiragana, K = katakana,
# A = Latin letters, N = digits.
#
# The half-width katakana and full-width Latin/digit ranges are written as \u
# escapes on purpose: tools that normalise character width would otherwise
# silently collapse them into broken ranges such as "[a-zA-Z--]", which makes
# "-" count as a letter and drops full-width input from the class entirely.
CHARTYPE =
  {
    "[一二三四五六七八九十百千万億兆]" => "M",
    "[一-龠々〆ヵヶ]" => "H",
    "[ぁ-ん]" => "I",
    # Full-width katakana plus half-width U+FF71-U+FF9D, the half-width
    # voiced sound mark U+FF9E and the half-width long-vowel mark U+FF70.
    "[ァ-ヴー\uFF71-\uFF9D\uFF9E\uFF70]" => "K",
    # ASCII letters plus full-width a-z (U+FF41-U+FF5A) and A-Z (U+FF21-U+FF3A).
    "[a-zA-Z\uFF41-\uFF5A\uFF21-\uFF3A]" => "A",
    # ASCII digits plus full-width 0-9 (U+FF10-U+FF19).
    "[0-9\uFF10-\uFF19]" => "N",
  }.map { |pattern, value| [Regexp.compile(pattern), value] }.freeze
# Starting score for every candidate boundary examined by `segment`. The
# feature weights are added on top of it, and a boundary is placed only when
# the total is positive.
BIAS = -322
BC1 = { "HH" => 6, "II" => 2461, "KH" => 406, "OH" => -1378 }
BC2 = {
"AA" => -3267,
"AI" => 2744,
"AN" => -878,
"HH" => -4070,
"HM" => -1711,
"HN" => 4012,
"HO" => 3761,
"IA" => 1327,
"IH" => -1184,
"II" => -1332,
"IK" => 1721,
"IO" => 5492,
"KI" => 3831,
"KK" => -8741,
"MH" => -3132,
"MK" => 3334,
"OO" => -2920,
}
BC3 = {
"HH" => 996,
"HI" => 626,
"HK" => -721,
"HN" => -1307,
"HO" => -836,
"IH" => -301,
"KK" => 2762,
"MK" => 1079,
"MM" => 4034,
"OA" => -1652,
"OH" => 266,
}
# Weights keyed by pairs of earlier boundary decisions tracked by `segment`.
# It keeps the last three decisions; BP1 is indexed by the two oldest and BP2
# by the two most recent. Each decision is a single letter: "U" (initial value,
# nothing decided yet), "O" (no boundary) or "B" (boundary placed).
BP1 = { "BB" => 295, "OB" => 304, "OO" => -125, "UB" => 352 }
BP2 = { "BO" => 60, "OO" => -1762 }
BQ1 = {
"BHH" => 1150,
"BHM" => 1521,
"BII" => -1158,
"BIM" => 886,
"BMH" => 1208,
"BNH" => 449,
"BOH" => -91,
"BOO" => -2597,
"OHI" => 451,
"OIH" => -296,
"OKA" => 1851,
"OKH" => -1020,
"OKK" => 904,
"OOO" => 2965,
}
BQ2 = {
"BHH" => 118,
"BHI" => -1159,
"BHM" => 466,
"BIH" => -919,
"BKK" => -1720,
"BKO" => 864,
"OHH" => -1139,
"OHM" => -181,
"OIH" => 153,
"UHI" => -1146,
}
BQ3 = {
"BHH" => -792,
"BHI" => 2664,
"BII" => -299,
"BKI" => 419,
"BMH" => 937,
"BMM" => 8335,
"BNN" => 998,
"BOH" => 775,
"OHH" => 2174,
"OHM" => 439,
"OII" => 280,
"OKH" => 1798,
"OKI" => -793,
"OKO" => -2242,
"OMH" => -2402,
"OOO" => 11_699,
}
BQ4 = {
"BHH" => -3895,
"BIH" => 3761,
"BII" => -4654,
"BIK" => 1348,
"BKK" => -1806,
"BMI" => -3385,
"BOO" => -12_396,
"OAH" => 926,
"OHH" => 266,
"OHK" => -2036,
"ONN" => -973,
}
BW1 = {
",と" => 660,
",同" => 727,
"B1あ" => 1404,
"B1同" => 542,
"、と" => 660,
"、同" => 727,
"」と" => 1682,
"あっ" => 1505,
"いう" => 1743,
"いっ" => -2055,
"いる" => 672,
"うし" => -4817,
"うん" => 665,
"から" => 3472,
"がら" => 600,
"こう" => -790,
"こと" => 2083,
"こん" => -1262,
"さら" => -4143,
"さん" => 4573,
"した" => 2641,
"して" => 1104,
"すで" => -3399,
"そこ" => 1977,
"それ" => -871,
"たち" => 1122,
"ため" => 601,
"った" => 3463,
"つい" => -802,
"てい" => 805,
"てき" => 1249,
"でき" => 1127,
"です" => 3445,
"では" => 844,
"とい" => -4915,
"とみ" => 1922,
"どこ" => 3887,
"ない" => 5713,
"なっ" => 3015,
"など" => 7379,
"なん" => -1113,
"にし" => 2468,
"には" => 1498,
"にも" => 1671,
"に対" => -912,
"の一" => -501,
"の中" => 741,
"ませ" => 2448,
"まで" => 1711,
"まま" => 2600,
"まる" => -2155,
"やむ" => -1947,
"よっ" => -2565,
"れた" => 2369,
"れで" => -913,
"をし" => 1860,
"を見" => 731,
"亡く" => -1886,
"京都" => 2558,
"取り" => -2784,
"大き" => -2604,
"大阪" => 1497,
"平方" => -2314,
"引き" => -1336,
"日本" => -195,
"本当" => -2423,
"毎日" => -2113,
"目指" => -724,
"B1あ" => 1404,
"B1同" => 542,
"」と" => 1682,
}
BW2 = {
".." => -11_822,
"11" => -669,
"――" => -5730,
"" => -13_175,
"いう" => -1609,
"うか" => 2490,
"かし" => -1350,
"かも" => -602,
"から" => -7194,
"かれ" => 4612,
"がい" => 853,
"がら" => -3198,
"きた" => 1941,
"くな" => -1597,
"こと" => -8392,
"この" => -4193,
"させ" => 4533,
"され" => 13_168,
"さん" => -3977,
"しい" => -1819,
"しか" => -545,
"した" => 5078,
"して" => 972,
"しな" => 939,
"その" => -3744,
"たい" => -1253,
"たた" => -662,
"ただ" => -3857,
"たち" => -786,
"たと" => 1224,
"たは" => -939,
"った" => 4589,
"って" => 1647,
"っと" => -2094,
"てい" => 6144,
"てき" => 3640,
"てく" => 2551,
"ては" => -3110,
"ても" => -3065,
"でい" => 2666,
"でき" => -1528,
"でし" => -3828,
"です" => -4761,
"でも" => -4203,
"とい" => 1890,
"とこ" => -1746,
"とと" => -2279,
"との" => 720,
"とみ" => 5168,
"とも" => -3941,
"ない" => -2488,
"なが" => -1313,
"など" => -6509,
"なの" => 2614,
"なん" => 3099,
"にお" => -1615,
"にし" => 2748,
"にな" => 2454,
"によ" => -7236,
"に対" => -14_943,
"に従" => -4688,
"に関" => -11_388,
"のか" => 2093,
"ので" => -7059,
"のに" => -6041,
"のの" => -6125,
"はい" => 1073,
"はが" => -1033,
"はず" => -2532,
"ばれ" => 1813,
"まし" => -1316,
"まで" => -6621,
"まれ" => 5409,
"めて" => -3153,
"もい" => 2230,
"もの" => -10_713,
"らか" => -944,
"らし" => -1611,
"らに" => -1897,
"りし" => 651,
"りま" => 1620,
"れた" => 4270,
"れて" => 849,
"れば" => 4114,
"ろう" => 6067,
"われ" => 7901,
"を通" => -11_877,
"んだ" => 728,
"んな" => -4115,
"一人" => 602,
"一方" => -1375,
"一日" => 970,
"一部" => -1051,
"上が" => -4479,
"会社" => -1116,
"出て" => 2163,
"分の" => -7758,
"同党" => 970,
"同日" => -913,
"大阪" => -2471,
"委員" => -1250,
"少な" => -1050,
"年度" => -8669,
"年間" => -1626,
"府県" => -2363,
"手権" => -1982,
"新聞" => -4066,
"日新" => -722,
"日本" => -7068,
"日米" => 3372,
"曜日" => -601,
"朝鮮" => -2355,
"本人" => -2697,
"東京" => -1543,
"然と" => -1384,
"社会" => -1276,
"立て" => -990,
"第に" => -1612,
"米国" => -4268,
"" => -669,
}
BW3 = {
"あた" => -2194,
"あり" => 719,
"ある" => 3846,
"い." => -1185,
"い。" => -1185,
"いい" => 5308,
"いえ" => 2079,
"いく" => 3029,
"いた" => 2056,
"いっ" => 1883,
"いる" => 5600,
"いわ" => 1527,
"うち" => 1117,
"うと" => 4798,
"えと" => 1454,
"か." => 2857,
"か。" => 2857,
"かけ" => -743,
"かっ" => -4098,
"かに" => -669,
"から" => 6520,
"かり" => -2670,
"が," => 1816,
"が、" => 1816,
"がき" => -4855,
"がけ" => -1127,
"がっ" => -913,
"がら" => -4977,
"がり" => -2064,
"きた" => 1645,
"けど" => 1374,
"こと" => 7397,
"この" => 1542,
"ころ" => -2757,
"さい" => -714,
"さを" => 976,
"し," => 1557,
"し、" => 1557,
"しい" => -3714,
"した" => 3562,
"して" => 1449,
"しな" => 2608,
"しま" => 1200,
"す." => -1310,
"す。" => -1310,
"する" => 6521,
"ず," => 3426,
"ず、" => 3426,
"ずに" => 841,
"そう" => 428,
"た." => 8875,
"た。" => 8875,
"たい" => -594,
"たの" => 812,
"たり" => -1183,
"たる" => -853,
"だ." => 4098,
"だ。" => 4098,
"だっ" => 1004,
"った" => -4748,
"って" => 300,
"てい" => 6240,
"てお" => 855,
"ても" => 302,
"です" => 1437,
"でに" => -1482,
"では" => 2295,
"とう" => -1387,
"とし" => 2266,
"との" => 541,
"とも" => -3543,
"どう" => 4664,
"ない" => 1796,
"なく" => -903,
"など" => 2135,
"に," => -1021,
"に、" => -1021,
"にし" => 1771,
"にな" => 1906,
"には" => 2644,
"の," => -724,
"の、" => -724,
"の子" => -1000,
"は," => 1337,
"は、" => 1337,
"べき" => 2181,
"まし" => 1113,
"ます" => 6943,
"まっ" => -1549,
"まで" => 6154,
"まれ" => -793,
"らし" => 1479,
"られ" => 6820,
"るる" => 3818,
"れ," => 854,
"れ、" => 854,
"れた" => 1850,
"れて" => 1375,
"れば" => -3246,
"れる" => 1091,
"われ" => -605,
"んだ" => 606,
"んで" => 798,
"カ月" => 990,
"会議" => 860,
"入り" => 1232,
"大会" => 2217,
"始め" => 1681,
"" => 965,
"新聞" => -5055,
"日," => 974,
"日、" => 974,
"社会" => 2024,
"カ月" => 990,
}
TC1 = {
"AAA" => 1093,
"HHH" => 1029,
"HHM" => 580,
"HII" => 998,
"HOH" => -390,
"HOM" => -331,
"IHI" => 1169,
"IOH" => -142,
"IOI" => -1015,
"IOM" => 467,
"MMH" => 187,
"OOI" => -1832,
}
TC2 = {
"HHO" => 2088,
"HII" => -1023,
"HMM" => -1154,
"IHI" => -1965,
"KKH" => 703,
"OII" => -2649,
}
TC3 = {
"AAA" => -294,
"HHH" => 346,
"HHI" => -341,
"HII" => -1088,
"HIK" => 731,
"HOH" => -1486,
"IHH" => 128,
"IHI" => -3041,
"IHO" => -1935,
"IIH" => -825,
"IIM" => -1035,
"IOI" => -542,
"KHH" => -1216,
"KKA" => 491,
"KKH" => -1217,
"KOK" => -1009,
"MHH" => -2694,
"MHM" => -457,
"MHO" => 123,
"MMH" => -471,
"NNH" => -1689,
"NNO" => 662,
"OHO" => -3393,
}
TC4 = {
"HHH" => -203,
"HHI" => 1344,
"HHK" => 365,
"HHM" => -122,
"HHN" => 182,
"HHO" => 669,
"HIH" => 804,
"HII" => 679,
"HOH" => 446,
"IHH" => 695,
"IHO" => -2324,
"IIH" => 321,
"III" => 1497,
"IIO" => 656,
"IOO" => 54,
"KAK" => 4845,
"KKA" => 3386,
"KKK" => 3065,
"MHH" => -405,
"MHI" => 201,
"MMH" => -241,
"MMM" => 661,
"MOM" => 841,
}
TQ1 = {
"BHHH" => -227,
"BHHI" => 316,
"BHIH" => -132,
"BIHH" => 60,
"BIII" => 1595,
"BNHH" => -744,
"BOHH" => 225,
"BOOO" => -908,
"OAKK" => 482,
"OHHH" => 281,
"OHIH" => 249,
"OIHI" => 200,
"OIIH" => -68,
}
TQ2 = { "BIHH" => -1401, "BIII" => -1033, "BKAK" => -543, "BOOO" => -5591 }
TQ3 = {
"BHHH" => 478,
"BHHM" => -1073,
"BHIH" => 222,
"BHII" => -504,
"BIIH" => -116,
"BIII" => -105,
"BMHI" => -863,
"BMHM" => -464,
"BOMH" => 620,
"OHHH" => 346,
"OHHI" => 1729,
"OHII" => 997,
"OHMH" => 481,
"OIHH" => 623,
"OIIH" => 1344,
"OKAK" => 2792,
"OKHH" => 587,
"OKKA" => 679,
"OOHH" => 110,
"OOII" => -685,
}
TQ4 = {
"BHHH" => -721,
"BHHM" => -3604,
"BHII" => -966,
"BIIH" => -607,
"BIII" => -2181,
"OAAA" => -2763,
"OAKK" => 180,
"OHHH" => -294,
"OHHI" => 2446,
"OHHO" => 480,
"OHIH" => -1573,
"OIHH" => 1935,
"OIHI" => -493,
"OIIH" => 626,
"OIII" => -4007,
"OKAK" => -8156,
}
TW1 = { "につい" => -4681, "東京都" => 2026 }
TW2 = {
"ある程" => -2049,
"いった" => -1256,
"ころが" => -2434,
"しょう" => 3873,
"その後" => -4430,
"だって" => -1049,
"ていた" => 1833,
"として" => -4657,
"ともに" => -4517,
"もので" => 1882,
"一気に" => -792,
"初めて" => -1512,
"同時に" => -8097,
"大きな" => -1255,
"対して" => -2721,
"社会党" => -3216,
}
TW3 = {
"いただ" => -1734,
"してい" => 1314,
"として" => -4314,
"につい" => -5483,
"にとっ" => -5989,
"に当た" => -6247,
"ので," => -727,
"ので、" => -727,
"のもの" => -600,
"れから" => -3752,
"十二月" => -2287,
}
TW4 = {
"いう." => 8576,
"いう。" => 8576,
"からな" => -2348,
"してい" => 2958,
"たが," => 1516,
"たが、" => 1516,
"ている" => 1538,
"という" => 1349,
"ました" => 5543,
"ません" => 1097,
"ようと" => -4258,
"よると" => 5865,
}
# Per-position character-type weights. `segment` indexes UC1..UC6 with the
# `ctype` codes of the six characters around a candidate boundary: UC1-UC3 for
# the three characters before it, UC4 for the character that would start the
# new word, and UC5-UC6 for the two characters after that. A code missing from
# a table contributes 0.
UC1 = { "A" => 484, "K" => 93, "M" => 645, "O" => -505 }
UC2 = { "A" => 819, "H" => 1059, "I" => 409, "M" => 3987, "N" => 5775, "O" => 646 }
UC3 = { "A" => -1370, "I" => 2311 }
UC4 = {
"A" => -2643,
"H" => 1809,
"I" => -1032,
"K" => -3450,
"M" => 3565,
"N" => 3876,
"O" => 6646,
}
UC5 = { "H" => 313, "I" => -1238, "K" => -799, "M" => 539, "O" => -831 }
UC6 = { "H" => -506, "I" => -253, "K" => 87, "M" => 247, "O" => -387 }
# Weights for each of the three previous boundary decisions taken on its own.
# `segment` indexes UP1 with the oldest decision and UP3 with the most recent.
# Decisions are "U" (not decided yet), "O" (no boundary) or "B" (boundary).
UP1 = { "O" => -214 }
UP2 = { "B" => 69, "O" => 935 }
UP3 = { "B" => 189 }
UQ1 = {
"BH" => 21,
"BI" => -12,
"BK" => -99,
"BN" => 142,
"BO" => -56,
"OH" => -95,
"OI" => 477,
"OK" => 410,
"OO" => -2422,
}
UQ2 = { "BH" => 216, "BI" => 113, "OK" => 1759 }
UQ3 = {
"BA" => -479,
"BH" => 42,
"BI" => 1913,
"BK" => -7198,
"BM" => 3160,
"BN" => 6427,
"BO" => 14_761,
"OI" => -827,
"ON" => -3212,
}
UW1 = {
"," => 156,
"" => 156,
"" => -463,
"" => -941,
"" => -127,
"" => -553,
"" => 121,
"" => 505,
"" => -201,
"" => -547,
"" => -123,
"" => -789,
"" => -185,
"" => -847,
"" => -466,
"" => -470,
"" => 182,
"" => -292,
"" => 208,
"" => 169,
"" => -446,
"" => -137,
"" => -135,
"" => -402,
"" => -268,
"" => -912,
"" => 871,
"" => -460,
"" => 561,
"" => 729,
"" => -411,
"" => -141,
"" => 361,
"" => -408,
"" => -386,
"" => -718,
"" => -463,
"" => -135,
}
UW2 = {
"," => -829,
"" => -829,
"" => 892,
"" => -645,
"" => 3145,
"" => -538,
"" => 505,
"" => 134,
"" => -502,
"" => 1454,
"" => -856,
"" => -412,
"" => 1141,
"" => 878,
"" => 540,
"" => 1529,
"" => -675,
"" => 300,
"" => -1011,
"" => 188,
"" => 1837,
"" => -949,
"" => -291,
"" => -268,
"" => -981,
"" => 1273,
"" => 1063,
"" => -1764,
"" => 130,
"" => -409,
"" => -1273,
"" => 1261,
"" => 600,
"" => -1263,
"" => -402,
"" => 1639,
"" => -579,
"" => -694,
"" => 571,
"" => -2516,
"" => 2095,
"" => -587,
"" => 306,
"" => 568,
"" => 831,
"" => -758,
"" => -2150,
"" => -302,
"" => -968,
"" => -861,
"" => 492,
"" => -123,
"" => 978,
"" => 362,
"" => 548,
"" => -3025,
"" => -1566,
"" => -3414,
"" => -422,
"" => -1769,
"" => -865,
"" => -483,
"" => -1519,
"" => 760,
"" => 1023,
"" => -2009,
"" => -813,
"" => -1060,
"" => 1067,
"" => -1519,
"" => -1033,
"" => 1522,
"" => -1355,
"" => -1682,
"" => -1815,
"" => -1462,
"" => -630,
"" => -1843,
"" => -1650,
"" => -931,
"" => -665,
"" => -2378,
"" => -180,
"" => -1740,
"" => 752,
"" => 529,
"" => -1584,
"" => -242,
"" => -1165,
"" => -763,
"" => 810,
"" => 509,
"" => -1353,
"" => 838,
"西" => -744,
"" => -3874,
"調" => 1010,
"" => 1198,
"" => 3041,
"" => 1758,
"" => -1257,
"" => -645,
"" => 3145,
"" => 831,
"" => -587,
"" => 306,
"" => 568,
}
UW3 = {
"," => 4889,
"1" => -800,
"" => -1723,
"" => 4889,
"" => -2311,
"" => 5827,
"" => 2670,
"" => -3573,
"" => -2696,
"" => 1006,
"" => 2342,
"" => 1983,
"" => -4864,
"" => -1163,
"" => 3271,
"" => 1004,
"" => 388,
"" => 401,
"" => -3552,
"" => -3116,
"" => -1058,
"" => -395,
"" => 584,
"" => 3685,
"" => -5228,
"" => 842,
"" => -521,
"" => -1444,
"" => -1081,
"" => 6167,
"" => 2318,
"" => 1691,
"" => -899,
"" => -2788,
"" => 2745,
"" => 4056,
"" => 4555,
"" => -2171,
"" => -1798,
"" => 1199,
"" => -5516,
"" => -4384,
"" => -120,
"" => 1205,
"" => 2323,
"" => -788,
"" => -202,
"" => 727,
"" => 649,
"" => 5905,
"" => 2773,
"" => -1207,
"" => 6620,
"" => -518,
"" => 551,
"" => 1319,
"" => 874,
"" => -1350,
"" => 521,
"" => 1109,
"" => 1591,
"" => 2201,
"" => 278,
"" => -3794,
"" => -1619,
"" => -1759,
"" => -2087,
"" => 3815,
"" => 653,
"" => -758,
"" => -1193,
"" => 974,
"" => 2742,
"" => 792,
"" => 1889,
"" => -1368,
"" => 811,
"" => 4265,
"" => -361,
"" => -2439,
"" => 4858,
"" => 3593,
"" => 1574,
"" => -3030,
"" => 755,
"" => -1880,
"" => 5807,
"" => 3095,
"" => 457,
"" => 2475,
"" => 1129,
"" => 2286,
"" => 4437,
"" => 365,
"" => -949,
"" => -1872,
"" => 1327,
"" => -1038,
"" => 4646,
"" => -2309,
"" => -783,
"" => -1006,
"" => 483,
"" => 1233,
"" => 3588,
"" => -241,
"" => 3906,
"" => -837,
"" => 4513,
"" => 642,
"" => 1389,
"" => 1219,
"" => -241,
"" => 2016,
"" => -1356,
"" => -423,
"" => -1008,
"" => 1078,
"" => -513,
"" => -3102,
"" => 1155,
"" => 3197,
"" => -1804,
"" => 2416,
"" => -1030,
"" => 1605,
"" => 1452,
"" => -2352,
"" => -3885,
"" => 1905,
"" => -1291,
"" => 1822,
"" => -488,
"" => -3973,
"" => -2013,
"" => -1479,
"" => 3222,
"" => -1489,
"" => 1764,
"" => 2099,
"" => 5792,
"" => -661,
"" => -1248,
"" => -951,
"" => -937,
"" => 4125,
"" => 360,
"" => 3094,
"" => 364,
"" => -805,
"" => 5156,
"" => 2438,
"" => 484,
"" => 2613,
"" => -1694,
"" => -1073,
"" => 1868,
"" => -495,
"" => 979,
"" => 461,
"" => -3850,
"" => -273,
"" => 914,
"" => 1215,
"" => 7313,
"" => -1835,
"" => 792,
"" => 6293,
"" => -1528,
"" => 4231,
"" => 401,
"" => -960,
"" => 1201,
"" => 7767,
"" => 3066,
"" => 3663,
"" => 1384,
"" => -4229,
"" => 1163,
"" => 1255,
"" => 6457,
"" => 725,
"" => -2869,
"" => 785,
"" => 1044,
"調" => -562,
"" => -733,
"" => 1777,
"" => 1835,
"" => 1375,
"" => -1504,
"" => -1136,
"" => -681,
"" => 1026,
"" => 4404,
"" => 1200,
"" => 2163,
"" => 421,
"" => -1432,
"" => 1302,
"" => -1282,
"" => 2009,
"" => -1045,
"" => 2066,
"" => 1620,
"" => -800,
"" => 2670,
"" => -3794,
"" => -1350,
"" => 551,
"グ" => 1319,
"" => 874,
"" => 521,
"" => 1109,
"" => 1591,
"" => 2201,
"" => 278,
}
UW4 = {
"," => 3930,
"." => 3508,
"" => -4841,
"" => 3930,
"" => 3508,
"" => 4999,
"" => 1895,
"" => 3798,
"" => -5156,
"" => 4752,
"" => -3435,
"" => -640,
"" => -2514,
"" => 2405,
"" => 530,
"" => 6006,
"" => -4482,
"" => -3821,
"" => -3788,
"" => -4376,
"" => -4734,
"" => 2255,
"" => 1979,
"" => 2864,
"" => -843,
"" => -2506,
"" => -731,
"" => 1251,
"" => 181,
"" => 4091,
"" => 5034,
"" => 5408,
"" => -3654,
"" => -5882,
"" => -1659,
"" => 3994,
"" => 7410,
"" => 4547,
"" => 5433,
"" => 6499,
"" => 1853,
"" => 1413,
"" => 7396,
"" => 8578,
"" => 1940,
"" => 4249,
"" => -4134,
"" => 1345,
"" => 6665,
"" => -744,
"" => 1464,
"" => 1051,
"" => -2082,
"" => -882,
"" => -5046,
"" => 4169,
"" => -2666,
"" => 2795,
"" => -1544,
"" => 3351,
"" => -2922,
"" => -9726,
"" => -14_896,
"" => -2613,
"" => -4570,
"" => -1783,
"" => 13_150,
"" => -2352,
"" => 2145,
"" => 1789,
"" => 1287,
"" => -724,
"" => -403,
"" => -1635,
"" => -881,
"" => -541,
"" => -856,
"" => -3637,
"" => -4371,
"" => -11_870,
"" => -2069,
"" => 2210,
"" => 782,
"" => -190,
"" => -1768,
"" => 1036,
"" => 544,
"" => 950,
"" => -1286,
"" => 530,
"" => 4292,
"" => 601,
"" => -2006,
"" => -1212,
"" => 584,
"" => 788,
"" => 1347,
"" => 1623,
"" => 3879,
"" => -302,
"" => -740,
"" => -2715,
"" => 776,
"" => 4517,
"" => 1013,
"" => 1555,
"" => -1834,
"" => -681,
"" => -910,
"" => -851,
"" => 1500,
"" => -619,
"" => -1200,
"" => 866,
"" => -1410,
"" => -2094,
"" => -1413,
"" => 1067,
"" => 571,
"" => -4802,
"" => -1397,
"" => -1057,
"" => -809,
"" => 1910,
"" => -1328,
"" => -1500,
"" => -2056,
"" => -2667,
"" => 2771,
"" => 374,
"" => -4556,
"" => 456,
"" => 553,
"" => 916,
"" => -1566,
"" => 856,
"" => 787,
"" => 2182,
"" => 704,
"" => 522,
"" => -856,
"" => 1798,
"" => 1829,
"" => 845,
"" => -9066,
"" => -485,
"" => -442,
"" => -360,
"" => -1043,
"" => 5388,
"" => -2716,
"" => -910,
"" => -939,
"" => -543,
"" => -735,
"" => 672,
"" => -1267,
"" => -1286,
"" => -1101,
"" => -2900,
"" => 1826,
"" => 2586,
"" => 922,
"" => -3485,
"" => 2997,
"" => -867,
"" => -2112,
"" => 788,
"" => 2937,
"" => 786,
"" => 2171,
"" => 1146,
"" => -1169,
"" => 940,
"" => -994,
"" => 749,
"" => 2145,
"" => -730,
"" => -852,
"" => -792,
"" => 792,
"" => -1184,
"" => -244,
"" => -1000,
"" => 730,
"" => -1481,
"" => 1158,
"" => -1433,
"" => -3370,
"" => 929,
"" => -1291,
"" => 2596,
"" => -4866,
"" => 1192,
"" => -1100,
"" => -2213,
"" => 357,
"" => -2344,
"" => -2297,
"" => -2604,
"" => -878,
"" => -1659,
"" => -792,
"" => -1984,
"" => 1749,
"" => 2120,
"" => 1895,
"" => 3798,
"" => -4371,
"" => -724,
"" => -11_870,
"" => 2145,
"" => 1789,
"" => 1287,
"" => -403,
"" => -1635,
"" => -881,
"" => -541,
"" => -856,
"" => -3637,
}
UW5 = {
"," => 465,
"." => -299,
"1" => -514,
"E2" => -32_768,
"]" => -2762,
"" => 465,
"" => -299,
"" => 363,
"" => 1655,
"" => 331,
"" => -503,
"" => 1199,
"" => 527,
"" => 647,
"" => -421,
"" => 1624,
"" => 1971,
"" => 312,
"" => -983,
"" => -1537,
"" => -1371,
"" => -852,
"" => -1186,
"" => 1093,
"" => 52,
"" => 921,
"" => -18,
"" => -850,
"" => -127,
"" => 1682,
"" => -787,
"" => -1224,
"" => -635,
"" => -578,
"" => 1001,
"" => 502,
"" => 865,
"" => 3350,
"" => 854,
"" => -208,
"" => 429,
"" => 504,
"" => 419,
"" => -1264,
"" => 327,
"" => 241,
"" => 451,
"" => -343,
"" => -871,
"" => 722,
"" => -1153,
"" => -654,
"" => 3519,
"" => -901,
"" => 848,
"" => 2104,
"" => -1296,
"" => -548,
"" => 1785,
"" => -1304,
"" => -2991,
"" => 921,
"" => 1763,
"" => 872,
"" => -814,
"" => 1618,
"" => -1682,
"" => 218,
"" => -4353,
"" => 932,
"" => 1356,
"" => -1508,
"" => -1347,
"" => 240,
"" => -3912,
"" => -3149,
"" => 1319,
"" => -1052,
"" => -4003,
"" => -997,
"" => -278,
"" => -813,
"" => 1955,
"" => -2233,
"" => 663,
"" => -1073,
"" => 1219,
"" => -1018,
"" => -368,
"" => 786,
"" => 1191,
"" => 2368,
"" => -689,
"" => -514,
"" => -32_768,
"" => 363,
"" => 241,
"" => 451,
"" => -343,
}
UW6 = {
"," => 227,
"." => 808,
"1" => -270,
"E1" => 306,
"" => 227,
"" => 808,
"" => -307,
"" => 189,
"" => 241,
"" => -73,
"" => -121,
"" => -200,
"" => 1782,
"" => 383,
"" => -428,
"" => 573,
"" => -1014,
"" => 101,
"" => -105,
"" => -253,
"" => -149,
"" => -417,
"" => -236,
"" => -206,
"" => 187,
"" => -135,
"" => 195,
"" => -673,
"" => -496,
"" => -277,
"" => 201,
"" => -800,
"" => 624,
"" => 302,
"" => 1792,
"" => -1212,
"" => 798,
"" => -960,
"" => 887,
"" => -695,
"" => 535,
"" => -697,
"" => 753,
"" => -507,
"" => 974,
"" => -822,
"" => 1811,
"" => 463,
"" => 1082,
"" => -270,
"" => 306,
"" => -673,
"" => -496,
}
class << self
# Splits +text+ into word-like chunks using the additive scoring model stored
# in this class's weight tables.
#
# Every position after the first character is a candidate boundary. Its score
# starts at BIAS and accumulates the weights of features built from a
# six-token window (three tokens before the candidate, three from it onward),
# from the character types of those tokens, and from the previous three
# boundary decisions. A positive total starts a new word at that position.
#
# @param text [String, nil] the text to segment
# @return [Array<String>] the chunks in input order; empty when +text+ is nil
#   or contains only whitespace
def segment(text)
  return [] if text.nil? || text.strip.empty?

  chars = text.chars
  # Sentinel tokens pad both ends so the six-token window always fits.
  tokens = %w[B3 B2 B1] + chars + %w[E1 E2 E3]
  kinds = %w[O O O] + chars.map { |ch| ctype(ch) } + %w[O O O]

  words = []
  current = tokens[3]
  # Last three boundary decisions, oldest first; "U" means "not decided yet".
  d1 = d2 = d3 = "U"

  (4..tokens.size - 4).each do |i|
    w1, w2, w3, w4, w5, w6 = tokens[i - 3, 6]
    c1, c2, c3, c4, c5, c6 = kinds[i - 3, 6]

    # N-gram keys shared by several tables.
    w23 = w2 + w3
    w34 = w3 + w4
    w45 = w4 + w5
    c23 = c2 + c3
    c34 = c3 + c4
    c45 = c4 + c5
    c123 = c1 + c23
    c234 = c23 + c4

    score = BIAS

    # Boundary-decision history.
    score += UP1.fetch(d1, 0) + UP2.fetch(d2, 0) + UP3.fetch(d3, 0)
    score += BP1.fetch(d1 + d2, 0) + BP2.fetch(d2 + d3, 0)

    # Surface tokens: unigrams, bigrams, trigrams.
    score += UW1.fetch(w1, 0) + UW2.fetch(w2, 0) + UW3.fetch(w3, 0)
    score += UW4.fetch(w4, 0) + UW5.fetch(w5, 0) + UW6.fetch(w6, 0)
    score += BW1.fetch(w23, 0) + BW2.fetch(w34, 0) + BW3.fetch(w45, 0)
    score += TW1.fetch(w1 + w23, 0) + TW2.fetch(w23 + w4, 0)
    score += TW3.fetch(w34 + w5, 0) + TW4.fetch(w45 + w6, 0)

    # Character types: unigrams, bigrams, trigrams.
    score += UC1.fetch(c1, 0) + UC2.fetch(c2, 0) + UC3.fetch(c3, 0)
    score += UC4.fetch(c4, 0) + UC5.fetch(c5, 0) + UC6.fetch(c6, 0)
    score += BC1.fetch(c23, 0) + BC2.fetch(c34, 0) + BC3.fetch(c45, 0)
    score += TC1.fetch(c123, 0) + TC2.fetch(c234, 0)
    score += TC3.fetch(c34 + c5, 0) + TC4.fetch(c45 + c6, 0)
    # A TC5 lookup on the TC4 key stays disabled; this class defines no TC5 table.

    # Decision history combined with character types.
    score += UQ1.fetch(d1 + c1, 0) + UQ2.fetch(d2 + c2, 0) + UQ3.fetch(d3 + c3, 0)
    score += BQ1.fetch(d2 + c23, 0) + BQ2.fetch(d2 + c34, 0)
    score += BQ3.fetch(d3 + c23, 0) + BQ4.fetch(d3 + c34, 0)
    score += TQ1.fetch(d2 + c123, 0) + TQ2.fetch(d2 + c234, 0)
    score += TQ3.fetch(d3 + c123, 0) + TQ4.fetch(d3 + c234, 0)

    if score > 0
      # Close the word collected so far; the current token opens the next one.
      words << current
      current = w4
      latest = "B"
    else
      current += w4
      latest = "O"
    end
    d1, d2, d3 = d2, d3, latest
  end

  words << current
end
private
# Returns the one-letter character-class code for +text+: the code paired with
# the first CHARTYPE pattern that matches, or "O" (other) when none does.
#
# @param text [String] the character to classify
# @return [String] the class code
def ctype(text)
  # match? is a plain boolean test; unlike match it builds no MatchData, which
  # matters because this runs once per pattern for every input character.
  _pattern, code = CHARTYPE.find { |regexp, _value| text.match?(regexp) }
  code || "O"
end
end
end