mirror of https://github.com/apache/lucene.git
LUCENE-5995: upgrade to ICU 54.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629799 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
commit
f6cbe636bb
|
@ -221,6 +221,8 @@ Build
|
||||||
* LUCENE-5962: Rename diffSources.py to createPatch.py and make it work with all text file types.
|
* LUCENE-5962: Rename diffSources.py to createPatch.py and make it work with all text file types.
|
||||||
(Ryan Ernst)
|
(Ryan Ernst)
|
||||||
|
|
||||||
|
* LUCENE-5995: Upgrade ICU to 54.1 (Robert Muir)
|
||||||
|
|
||||||
Other
|
Other
|
||||||
|
|
||||||
* LUCENE-5563: Removed sep layout: which has fallen behind on features and doesn't
|
* LUCENE-5563: Removed sep layout: which has fallen behind on features and doesn't
|
||||||
|
|
|
@ -1,50 +0,0 @@
|
||||||
#
|
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
# contributor license agreements. See the NOTICE file distributed with
|
|
||||||
# this work for additional information regarding copyright ownership.
|
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
# (the "License"); you may not use this file except in compliance with
|
|
||||||
# the License. You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# Parses Myanmar text, with syllable as token.
|
|
||||||
#
|
|
||||||
|
|
||||||
$Cons = [[:Other_Letter:]&[:Myanmar:]];
|
|
||||||
$Virama = [\u1039];
|
|
||||||
$Asat = [\u103A];
|
|
||||||
|
|
||||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
|
||||||
|
|
||||||
#
|
|
||||||
# default numerical definitions
|
|
||||||
#
|
|
||||||
$Extend = [\p{Word_Break = Extend}];
|
|
||||||
$Format = [\p{Word_Break = Format}];
|
|
||||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
|
||||||
$MidNum = [\p{Word_Break = MidNum}];
|
|
||||||
$Numeric = [\p{Word_Break = Numeric}];
|
|
||||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
|
||||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
|
||||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
|
||||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
|
||||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
|
||||||
|
|
||||||
$ConsEx = $Cons ($Extend | $Format)*;
|
|
||||||
$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
|
|
||||||
$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
|
|
||||||
$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
|
|
||||||
|
|
||||||
!!forward;
|
|
||||||
$MyanmarJoinedSyllableEx {200};
|
|
||||||
|
|
||||||
# default numeric rules
|
|
||||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
|
@ -49,6 +49,7 @@
|
||||||
2E17>002D
|
2E17>002D
|
||||||
2E1A>002D
|
2E1A>002D
|
||||||
2E3A..2E3B>002D
|
2E3A..2E3B>002D
|
||||||
|
2E40>002D
|
||||||
301C>002D
|
301C>002D
|
||||||
3030>002D
|
3030>002D
|
||||||
30A0>002D
|
30A0>002D
|
||||||
|
|
|
@ -102,6 +102,7 @@
|
||||||
1939..193B>
|
1939..193B>
|
||||||
1A75..1A7C>
|
1A75..1A7C>
|
||||||
1A7F>
|
1A7F>
|
||||||
|
1AB0..1ABD>
|
||||||
1B34>
|
1B34>
|
||||||
1B44>
|
1B44>
|
||||||
1B6B..1B73>
|
1B6B..1B73>
|
||||||
|
@ -111,8 +112,10 @@
|
||||||
1CD0..1CE8>
|
1CD0..1CE8>
|
||||||
1CED>
|
1CED>
|
||||||
1CF4>
|
1CF4>
|
||||||
|
1CF8..1CF9>
|
||||||
1D2C..1D6A>
|
1D2C..1D6A>
|
||||||
1DC4..1DCF>
|
1DC4..1DCF>
|
||||||
|
1DF5>
|
||||||
1DFD..1DFF>
|
1DFD..1DFF>
|
||||||
1FBD>
|
1FBD>
|
||||||
1FBF..1FC1>
|
1FBF..1FC1>
|
||||||
|
@ -128,6 +131,7 @@
|
||||||
A66F>
|
A66F>
|
||||||
A67C..A67D>
|
A67C..A67D>
|
||||||
A67F>
|
A67F>
|
||||||
|
A69C..A69D>
|
||||||
A6F0..A6F1>
|
A6F0..A6F1>
|
||||||
A717..A721>
|
A717..A721>
|
||||||
A788>
|
A788>
|
||||||
|
@ -138,27 +142,43 @@ A92B..A92E>
|
||||||
A953>
|
A953>
|
||||||
A9B3>
|
A9B3>
|
||||||
A9C0>
|
A9C0>
|
||||||
AA7B>
|
A9E5>
|
||||||
|
AA7B..AA7D>
|
||||||
AABF..AAC2>
|
AABF..AAC2>
|
||||||
AAF6>
|
AAF6>
|
||||||
|
AB5B..AB5F>
|
||||||
ABEC..ABED>
|
ABEC..ABED>
|
||||||
FB1E>
|
FB1E>
|
||||||
FE20..FE26>
|
FE20..FE2D>
|
||||||
FF3E>
|
FF3E>
|
||||||
FF40>
|
FF40>
|
||||||
FF70>
|
FF70>
|
||||||
FF9E..FF9F>
|
FF9E..FF9F>
|
||||||
FFE3>
|
FFE3>
|
||||||
|
102E0>
|
||||||
|
10AE5..10AE6>
|
||||||
110B9..110BA>
|
110B9..110BA>
|
||||||
11133..11134>
|
11133..11134>
|
||||||
|
11173>
|
||||||
111C0>
|
111C0>
|
||||||
|
11235..11236>
|
||||||
|
112E9..112EA>
|
||||||
|
1133C>
|
||||||
|
1134D>
|
||||||
|
11366..1136C>
|
||||||
|
11370..11374>
|
||||||
|
114C2..114C3>
|
||||||
|
115BF..115C0>
|
||||||
|
1163F>
|
||||||
116B6..116B7>
|
116B6..116B7>
|
||||||
|
16AF0..16AF4>
|
||||||
16F8F..16F9F>
|
16F8F..16F9F>
|
||||||
1D167..1D169>
|
1D167..1D169>
|
||||||
1D16D..1D172>
|
1D16D..1D172>
|
||||||
1D17B..1D182>
|
1D17B..1D182>
|
||||||
1D185..1D18B>
|
1D185..1D18B>
|
||||||
1D1AA..1D1AD>
|
1D1AA..1D1AD>
|
||||||
|
1E8D0..1E8D6>
|
||||||
|
|
||||||
# Latin script "composed" that do not further decompose, so decompose here
|
# Latin script "composed" that do not further decompose, so decompose here
|
||||||
# These are from AsciiFoldingFilter
|
# These are from AsciiFoldingFilter
|
||||||
|
|
|
@ -151,6 +151,16 @@
|
||||||
0D6D>0037 # MALAYALAM DIGIT SEVEN
|
0D6D>0037 # MALAYALAM DIGIT SEVEN
|
||||||
0D6E>0038 # MALAYALAM DIGIT EIGHT
|
0D6E>0038 # MALAYALAM DIGIT EIGHT
|
||||||
0D6F>0039 # MALAYALAM DIGIT NINE
|
0D6F>0039 # MALAYALAM DIGIT NINE
|
||||||
|
0DE6>0030 # SINHALA LITH DIGIT ZERO
|
||||||
|
0DE7>0031 # SINHALA LITH DIGIT ONE
|
||||||
|
0DE8>0032 # SINHALA LITH DIGIT TWO
|
||||||
|
0DE9>0033 # SINHALA LITH DIGIT THREE
|
||||||
|
0DEA>0034 # SINHALA LITH DIGIT FOUR
|
||||||
|
0DEB>0035 # SINHALA LITH DIGIT FIVE
|
||||||
|
0DEC>0036 # SINHALA LITH DIGIT SIX
|
||||||
|
0DED>0037 # SINHALA LITH DIGIT SEVEN
|
||||||
|
0DEE>0038 # SINHALA LITH DIGIT EIGHT
|
||||||
|
0DEF>0039 # SINHALA LITH DIGIT NINE
|
||||||
0E50>0030 # THAI DIGIT ZERO
|
0E50>0030 # THAI DIGIT ZERO
|
||||||
0E51>0031 # THAI DIGIT ONE
|
0E51>0031 # THAI DIGIT ONE
|
||||||
0E52>0032 # THAI DIGIT TWO
|
0E52>0032 # THAI DIGIT TWO
|
||||||
|
@ -388,6 +398,16 @@ A9D6>0036 # JAVANESE DIGIT SIX
|
||||||
A9D7>0037 # JAVANESE DIGIT SEVEN
|
A9D7>0037 # JAVANESE DIGIT SEVEN
|
||||||
A9D8>0038 # JAVANESE DIGIT EIGHT
|
A9D8>0038 # JAVANESE DIGIT EIGHT
|
||||||
A9D9>0039 # JAVANESE DIGIT NINE
|
A9D9>0039 # JAVANESE DIGIT NINE
|
||||||
|
A9F0>0030 # MYANMAR TAI LAING DIGIT ZERO
|
||||||
|
A9F1>0031 # MYANMAR TAI LAING DIGIT ONE
|
||||||
|
A9F2>0032 # MYANMAR TAI LAING DIGIT TWO
|
||||||
|
A9F3>0033 # MYANMAR TAI LAING DIGIT THREE
|
||||||
|
A9F4>0034 # MYANMAR TAI LAING DIGIT FOUR
|
||||||
|
A9F5>0035 # MYANMAR TAI LAING DIGIT FIVE
|
||||||
|
A9F6>0036 # MYANMAR TAI LAING DIGIT SIX
|
||||||
|
A9F7>0037 # MYANMAR TAI LAING DIGIT SEVEN
|
||||||
|
A9F8>0038 # MYANMAR TAI LAING DIGIT EIGHT
|
||||||
|
A9F9>0039 # MYANMAR TAI LAING DIGIT NINE
|
||||||
AA50>0030 # CHAM DIGIT ZERO
|
AA50>0030 # CHAM DIGIT ZERO
|
||||||
AA51>0031 # CHAM DIGIT ONE
|
AA51>0031 # CHAM DIGIT ONE
|
||||||
AA52>0032 # CHAM DIGIT TWO
|
AA52>0032 # CHAM DIGIT TWO
|
||||||
|
@ -480,6 +500,36 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
||||||
111D7>0037 # SHARADA DIGIT SEVEN
|
111D7>0037 # SHARADA DIGIT SEVEN
|
||||||
111D8>0038 # SHARADA DIGIT EIGHT
|
111D8>0038 # SHARADA DIGIT EIGHT
|
||||||
111D9>0039 # SHARADA DIGIT NINE
|
111D9>0039 # SHARADA DIGIT NINE
|
||||||
|
112F0>0030 # KHUDAWADI DIGIT ZERO
|
||||||
|
112F1>0031 # KHUDAWADI DIGIT ONE
|
||||||
|
112F2>0032 # KHUDAWADI DIGIT TWO
|
||||||
|
112F3>0033 # KHUDAWADI DIGIT THREE
|
||||||
|
112F4>0034 # KHUDAWADI DIGIT FOUR
|
||||||
|
112F5>0035 # KHUDAWADI DIGIT FIVE
|
||||||
|
112F6>0036 # KHUDAWADI DIGIT SIX
|
||||||
|
112F7>0037 # KHUDAWADI DIGIT SEVEN
|
||||||
|
112F8>0038 # KHUDAWADI DIGIT EIGHT
|
||||||
|
112F9>0039 # KHUDAWADI DIGIT NINE
|
||||||
|
114D0>0030 # TIRHUTA DIGIT ZERO
|
||||||
|
114D1>0031 # TIRHUTA DIGIT ONE
|
||||||
|
114D2>0032 # TIRHUTA DIGIT TWO
|
||||||
|
114D3>0033 # TIRHUTA DIGIT THREE
|
||||||
|
114D4>0034 # TIRHUTA DIGIT FOUR
|
||||||
|
114D5>0035 # TIRHUTA DIGIT FIVE
|
||||||
|
114D6>0036 # TIRHUTA DIGIT SIX
|
||||||
|
114D7>0037 # TIRHUTA DIGIT SEVEN
|
||||||
|
114D8>0038 # TIRHUTA DIGIT EIGHT
|
||||||
|
114D9>0039 # TIRHUTA DIGIT NINE
|
||||||
|
11650>0030 # MODI DIGIT ZERO
|
||||||
|
11651>0031 # MODI DIGIT ONE
|
||||||
|
11652>0032 # MODI DIGIT TWO
|
||||||
|
11653>0033 # MODI DIGIT THREE
|
||||||
|
11654>0034 # MODI DIGIT FOUR
|
||||||
|
11655>0035 # MODI DIGIT FIVE
|
||||||
|
11656>0036 # MODI DIGIT SIX
|
||||||
|
11657>0037 # MODI DIGIT SEVEN
|
||||||
|
11658>0038 # MODI DIGIT EIGHT
|
||||||
|
11659>0039 # MODI DIGIT NINE
|
||||||
116C0>0030 # TAKRI DIGIT ZERO
|
116C0>0030 # TAKRI DIGIT ZERO
|
||||||
116C1>0031 # TAKRI DIGIT ONE
|
116C1>0031 # TAKRI DIGIT ONE
|
||||||
116C2>0032 # TAKRI DIGIT TWO
|
116C2>0032 # TAKRI DIGIT TWO
|
||||||
|
@ -490,4 +540,34 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
||||||
116C7>0037 # TAKRI DIGIT SEVEN
|
116C7>0037 # TAKRI DIGIT SEVEN
|
||||||
116C8>0038 # TAKRI DIGIT EIGHT
|
116C8>0038 # TAKRI DIGIT EIGHT
|
||||||
116C9>0039 # TAKRI DIGIT NINE
|
116C9>0039 # TAKRI DIGIT NINE
|
||||||
|
118E0>0030 # WARANG CITI DIGIT ZERO
|
||||||
|
118E1>0031 # WARANG CITI DIGIT ONE
|
||||||
|
118E2>0032 # WARANG CITI DIGIT TWO
|
||||||
|
118E3>0033 # WARANG CITI DIGIT THREE
|
||||||
|
118E4>0034 # WARANG CITI DIGIT FOUR
|
||||||
|
118E5>0035 # WARANG CITI DIGIT FIVE
|
||||||
|
118E6>0036 # WARANG CITI DIGIT SIX
|
||||||
|
118E7>0037 # WARANG CITI DIGIT SEVEN
|
||||||
|
118E8>0038 # WARANG CITI DIGIT EIGHT
|
||||||
|
118E9>0039 # WARANG CITI DIGIT NINE
|
||||||
|
16A60>0030 # MRO DIGIT ZERO
|
||||||
|
16A61>0031 # MRO DIGIT ONE
|
||||||
|
16A62>0032 # MRO DIGIT TWO
|
||||||
|
16A63>0033 # MRO DIGIT THREE
|
||||||
|
16A64>0034 # MRO DIGIT FOUR
|
||||||
|
16A65>0035 # MRO DIGIT FIVE
|
||||||
|
16A66>0036 # MRO DIGIT SIX
|
||||||
|
16A67>0037 # MRO DIGIT SEVEN
|
||||||
|
16A68>0038 # MRO DIGIT EIGHT
|
||||||
|
16A69>0039 # MRO DIGIT NINE
|
||||||
|
16B50>0030 # PAHAWH HMONG DIGIT ZERO
|
||||||
|
16B51>0031 # PAHAWH HMONG DIGIT ONE
|
||||||
|
16B52>0032 # PAHAWH HMONG DIGIT TWO
|
||||||
|
16B53>0033 # PAHAWH HMONG DIGIT THREE
|
||||||
|
16B54>0034 # PAHAWH HMONG DIGIT FOUR
|
||||||
|
16B55>0035 # PAHAWH HMONG DIGIT FIVE
|
||||||
|
16B56>0036 # PAHAWH HMONG DIGIT SIX
|
||||||
|
16B57>0037 # PAHAWH HMONG DIGIT SEVEN
|
||||||
|
16B58>0038 # PAHAWH HMONG DIGIT EIGHT
|
||||||
|
16B59>0039 # PAHAWH HMONG DIGIT NINE
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright (C) 1999-2013, International Business Machines
|
# Copyright (C) 1999-2014, International Business Machines
|
||||||
# Corporation and others. All Rights Reserved.
|
# Corporation and others. All Rights Reserved.
|
||||||
#
|
#
|
||||||
# file name: nfc.txt
|
# file name: nfc.txt
|
||||||
|
@ -7,7 +7,7 @@
|
||||||
#
|
#
|
||||||
# Complete data for Unicode NFC normalization.
|
# Complete data for Unicode NFC normalization.
|
||||||
|
|
||||||
* Unicode 6.3.0
|
* Unicode 7.0.0
|
||||||
|
|
||||||
# Canonical_Combining_Class (ccc) values
|
# Canonical_Combining_Class (ccc) values
|
||||||
0300..0314:230
|
0300..0314:230
|
||||||
|
@ -142,7 +142,7 @@
|
||||||
08F6:220
|
08F6:220
|
||||||
08F7..08F8:230
|
08F7..08F8:230
|
||||||
08F9..08FA:220
|
08F9..08FA:220
|
||||||
08FB..08FE:230
|
08FB..08FF:230
|
||||||
093C:7
|
093C:7
|
||||||
094D:9
|
094D:9
|
||||||
0951:230
|
0951:230
|
||||||
|
@ -199,6 +199,10 @@
|
||||||
1A60:9
|
1A60:9
|
||||||
1A75..1A7C:230
|
1A75..1A7C:230
|
||||||
1A7F:220
|
1A7F:220
|
||||||
|
1AB0..1AB4:230
|
||||||
|
1AB5..1ABA:220
|
||||||
|
1ABB..1ABC:230
|
||||||
|
1ABD:220
|
||||||
1B34:7
|
1B34:7
|
||||||
1B44:9
|
1B44:9
|
||||||
1B6B:230
|
1B6B:230
|
||||||
|
@ -217,6 +221,7 @@
|
||||||
1CE2..1CE8:1
|
1CE2..1CE8:1
|
||||||
1CED:220
|
1CED:220
|
||||||
1CF4:230
|
1CF4:230
|
||||||
|
1CF8..1CF9:230
|
||||||
1DC0..1DC1:230
|
1DC0..1DC1:230
|
||||||
1DC2:220
|
1DC2:220
|
||||||
1DC3..1DC9:230
|
1DC3..1DC9:230
|
||||||
|
@ -226,7 +231,7 @@
|
||||||
1DCE:214
|
1DCE:214
|
||||||
1DCF:220
|
1DCF:220
|
||||||
1DD0:202
|
1DD0:202
|
||||||
1DD1..1DE6:230
|
1DD1..1DF5:230
|
||||||
1DFC:233
|
1DFC:233
|
||||||
1DFD:220
|
1DFD:220
|
||||||
1DFE:230
|
1DFE:230
|
||||||
|
@ -274,21 +279,44 @@ AAF6:9
|
||||||
ABED:9
|
ABED:9
|
||||||
FB1E:26
|
FB1E:26
|
||||||
FE20..FE26:230
|
FE20..FE26:230
|
||||||
|
FE27..FE2D:220
|
||||||
101FD:220
|
101FD:220
|
||||||
|
102E0:220
|
||||||
|
10376..1037A:230
|
||||||
10A0D:220
|
10A0D:220
|
||||||
10A0F:230
|
10A0F:230
|
||||||
10A38:230
|
10A38:230
|
||||||
10A39:1
|
10A39:1
|
||||||
10A3A:220
|
10A3A:220
|
||||||
10A3F:9
|
10A3F:9
|
||||||
|
10AE5:230
|
||||||
|
10AE6:220
|
||||||
11046:9
|
11046:9
|
||||||
|
1107F:9
|
||||||
110B9:9
|
110B9:9
|
||||||
110BA:7
|
110BA:7
|
||||||
11100..11102:230
|
11100..11102:230
|
||||||
11133..11134:9
|
11133..11134:9
|
||||||
|
11173:7
|
||||||
111C0:9
|
111C0:9
|
||||||
|
11235:9
|
||||||
|
11236:7
|
||||||
|
112E9:7
|
||||||
|
112EA:9
|
||||||
|
1133C:7
|
||||||
|
1134D:9
|
||||||
|
11366..1136C:230
|
||||||
|
11370..11374:230
|
||||||
|
114C2:9
|
||||||
|
114C3:7
|
||||||
|
115BF:9
|
||||||
|
115C0:7
|
||||||
|
1163F:9
|
||||||
116B6:9
|
116B6:9
|
||||||
116B7:7
|
116B7:7
|
||||||
|
16AF0..16AF4:1
|
||||||
|
16B30..16B36:230
|
||||||
|
1BC9E:1
|
||||||
1D165..1D166:216
|
1D165..1D166:216
|
||||||
1D167..1D169:1
|
1D167..1D169:1
|
||||||
1D16D:226
|
1D16D:226
|
||||||
|
@ -298,6 +326,7 @@ FE20..FE26:230
|
||||||
1D18A..1D18B:220
|
1D18A..1D18B:220
|
||||||
1D1AA..1D1AD:230
|
1D1AA..1D1AD:230
|
||||||
1D242..1D244:230
|
1D242..1D244:230
|
||||||
|
1E8D0..1E8D6:220
|
||||||
|
|
||||||
# Canonical decomposition mappings
|
# Canonical decomposition mappings
|
||||||
00C0>0041 0300 # one-way: diacritic 0300
|
00C0>0041 0300 # one-way: diacritic 0300
|
||||||
|
@ -1798,6 +1827,13 @@ FB4E>05E4 05BF
|
||||||
110AB>110A5 110BA # one-way: diacritic 110BA
|
110AB>110A5 110BA # one-way: diacritic 110BA
|
||||||
1112E=11131 11127
|
1112E=11131 11127
|
||||||
1112F=11132 11127
|
1112F=11132 11127
|
||||||
|
1134B=11347 1133E
|
||||||
|
1134C=11347 11357
|
||||||
|
114BB=114B9 114BA
|
||||||
|
114BC=114B9 114B0
|
||||||
|
114BE=114B9 114BD
|
||||||
|
115BA=115B8 115AF
|
||||||
|
115BB=115B9 115AF
|
||||||
1D15E>1D157 1D165
|
1D15E>1D157 1D165
|
||||||
1D15F>1D158 1D165
|
1D15F>1D158 1D165
|
||||||
1D160>1D15F 1D16E
|
1D160>1D15F 1D16E
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright (C) 1999-2013, International Business Machines
|
# Copyright (C) 1999-2014, International Business Machines
|
||||||
# Corporation and others. All Rights Reserved.
|
# Corporation and others. All Rights Reserved.
|
||||||
#
|
#
|
||||||
# file name: nfkc.txt
|
# file name: nfkc.txt
|
||||||
|
@ -11,7 +11,7 @@
|
||||||
# to NFKC one-way mappings.
|
# to NFKC one-way mappings.
|
||||||
# Use this file as the second gennorm2 input file after nfc.txt.
|
# Use this file as the second gennorm2 input file after nfc.txt.
|
||||||
|
|
||||||
* Unicode 6.3.0
|
* Unicode 7.0.0
|
||||||
|
|
||||||
00A0>0020
|
00A0>0020
|
||||||
00A8>0020 0308
|
00A8>0020 0308
|
||||||
|
@ -1361,9 +1361,15 @@
|
||||||
33FD>0033 0030 65E5
|
33FD>0033 0030 65E5
|
||||||
33FE>0033 0031 65E5
|
33FE>0033 0031 65E5
|
||||||
33FF>0067 0061 006C
|
33FF>0067 0061 006C
|
||||||
|
A69C>044A
|
||||||
|
A69D>044C
|
||||||
A770>A76F
|
A770>A76F
|
||||||
A7F8>0126
|
A7F8>0126
|
||||||
A7F9>0153
|
A7F9>0153
|
||||||
|
AB5C>A727
|
||||||
|
AB5D>AB37
|
||||||
|
AB5E>026B
|
||||||
|
AB5F>AB52
|
||||||
FB00>0066 0066
|
FB00>0066 0066
|
||||||
FB01>0066 0069
|
FB01>0066 0069
|
||||||
FB02>0066 006C
|
FB02>0066 006C
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Unicode Character Database
|
# Unicode Character Database
|
||||||
# Copyright (c) 1991-2013 Unicode, Inc.
|
# Copyright (c) 1991-2014 Unicode, Inc.
|
||||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||||
#
|
#
|
||||||
|
@ -12,7 +12,7 @@
|
||||||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||||
|
|
||||||
* Unicode 6.3.0
|
* Unicode 7.0.0
|
||||||
|
|
||||||
0041>0061
|
0041>0061
|
||||||
0042>0062
|
0042>0062
|
||||||
|
@ -286,6 +286,7 @@
|
||||||
0376>0377
|
0376>0377
|
||||||
037A>0020 03B9
|
037A>0020 03B9
|
||||||
037E>003B
|
037E>003B
|
||||||
|
037F>03F3
|
||||||
0384>0020 0301
|
0384>0020 0301
|
||||||
0385>0020 0308 0301
|
0385>0020 0308 0301
|
||||||
0386>03AC
|
0386>03AC
|
||||||
|
@ -498,6 +499,10 @@
|
||||||
0522>0523
|
0522>0523
|
||||||
0524>0525
|
0524>0525
|
||||||
0526>0527
|
0526>0527
|
||||||
|
0528>0529
|
||||||
|
052A>052B
|
||||||
|
052C>052D
|
||||||
|
052E>052F
|
||||||
0531>0561
|
0531>0561
|
||||||
0532>0562
|
0532>0562
|
||||||
0533>0563
|
0533>0563
|
||||||
|
@ -2308,6 +2313,10 @@ A690>A691
|
||||||
A692>A693
|
A692>A693
|
||||||
A694>A695
|
A694>A695
|
||||||
A696>A697
|
A696>A697
|
||||||
|
A698>A699
|
||||||
|
A69A>A69B
|
||||||
|
A69C>044A
|
||||||
|
A69D>044C
|
||||||
A722>A723
|
A722>A723
|
||||||
A724>A725
|
A724>A725
|
||||||
A726>A727
|
A726>A727
|
||||||
|
@ -2359,14 +2368,28 @@ A78B>A78C
|
||||||
A78D>0265
|
A78D>0265
|
||||||
A790>A791
|
A790>A791
|
||||||
A792>A793
|
A792>A793
|
||||||
|
A796>A797
|
||||||
|
A798>A799
|
||||||
|
A79A>A79B
|
||||||
|
A79C>A79D
|
||||||
|
A79E>A79F
|
||||||
A7A0>A7A1
|
A7A0>A7A1
|
||||||
A7A2>A7A3
|
A7A2>A7A3
|
||||||
A7A4>A7A5
|
A7A4>A7A5
|
||||||
A7A6>A7A7
|
A7A6>A7A7
|
||||||
A7A8>A7A9
|
A7A8>A7A9
|
||||||
A7AA>0266
|
A7AA>0266
|
||||||
|
A7AB>025C
|
||||||
|
A7AC>0261
|
||||||
|
A7AD>026C
|
||||||
|
A7B0>029E
|
||||||
|
A7B1>0287
|
||||||
A7F8>0127
|
A7F8>0127
|
||||||
A7F9>0153
|
A7F9>0153
|
||||||
|
AB5C>A727
|
||||||
|
AB5D>AB37
|
||||||
|
AB5E>026B
|
||||||
|
AB5F>AB52
|
||||||
F900>8C48
|
F900>8C48
|
||||||
F901>66F4
|
F901>66F4
|
||||||
F902>8ECA
|
F902>8ECA
|
||||||
|
@ -3743,6 +3766,39 @@ FFF0..FFF8>
|
||||||
10425>1044D
|
10425>1044D
|
||||||
10426>1044E
|
10426>1044E
|
||||||
10427>1044F
|
10427>1044F
|
||||||
|
118A0>118C0
|
||||||
|
118A1>118C1
|
||||||
|
118A2>118C2
|
||||||
|
118A3>118C3
|
||||||
|
118A4>118C4
|
||||||
|
118A5>118C5
|
||||||
|
118A6>118C6
|
||||||
|
118A7>118C7
|
||||||
|
118A8>118C8
|
||||||
|
118A9>118C9
|
||||||
|
118AA>118CA
|
||||||
|
118AB>118CB
|
||||||
|
118AC>118CC
|
||||||
|
118AD>118CD
|
||||||
|
118AE>118CE
|
||||||
|
118AF>118CF
|
||||||
|
118B0>118D0
|
||||||
|
118B1>118D1
|
||||||
|
118B2>118D2
|
||||||
|
118B3>118D3
|
||||||
|
118B4>118D4
|
||||||
|
118B5>118D5
|
||||||
|
118B6>118D6
|
||||||
|
118B7>118D7
|
||||||
|
118B8>118D8
|
||||||
|
118B9>118D9
|
||||||
|
118BA>118DA
|
||||||
|
118BB>118DB
|
||||||
|
118BC>118DC
|
||||||
|
118BD>118DD
|
||||||
|
118BE>118DE
|
||||||
|
118BF>118DF
|
||||||
|
1BCA0..1BCA3>
|
||||||
1D15E>1D157 1D165
|
1D15E>1D157 1D165
|
||||||
1D15F>1D158 1D165
|
1D15F>1D158 1D165
|
||||||
1D160>1D158 1D165 1D16E
|
1D160>1D158 1D165 1D16E
|
||||||
|
|
|
@ -35,8 +35,8 @@ import com.ibm.icu.util.ULocale;
|
||||||
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
||||||
* but with the following tailorings:
|
* but with the following tailorings:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>Thai, Lao, and CJK text is broken into words with a dictionary.
|
* <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary.
|
||||||
* <li>Myanmar, and Khmer text is broken into syllables
|
* <li>Khmer text is broken into syllables
|
||||||
* based on custom BreakIterator rules.
|
* based on custom BreakIterator rules.
|
||||||
* </ul>
|
* </ul>
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
|
@ -67,8 +67,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
||||||
readBreakIterator("Default.brk");
|
readBreakIterator("Default.brk");
|
||||||
private static final BreakIterator khmerBreakIterator =
|
private static final BreakIterator khmerBreakIterator =
|
||||||
readBreakIterator("Khmer.brk");
|
readBreakIterator("Khmer.brk");
|
||||||
private static final BreakIterator myanmarBreakIterator =
|
|
||||||
readBreakIterator("Myanmar.brk");
|
|
||||||
|
|
||||||
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
||||||
private final boolean cjkAsWords;
|
private final boolean cjkAsWords;
|
||||||
|
@ -94,7 +92,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
||||||
public BreakIterator getBreakIterator(int script) {
|
public BreakIterator getBreakIterator(int script) {
|
||||||
switch(script) {
|
switch(script) {
|
||||||
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
|
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
|
||||||
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
|
|
||||||
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
||||||
default: return (BreakIterator)defaultBreakIterator.clone();
|
default: return (BreakIterator)defaultBreakIterator.clone();
|
||||||
}
|
}
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -122,6 +122,10 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
||||||
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
|
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testMyanmar() throws Exception {
|
||||||
|
assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်ဝင်", "လှုပ်ရှား", "စေ", "ပြီး" });
|
||||||
|
}
|
||||||
|
|
||||||
public void testThai() throws Exception {
|
public void testThai() throws Exception {
|
||||||
assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
|
assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
|
||||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"});
|
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"});
|
||||||
|
|
|
@ -63,7 +63,7 @@ import java.util.regex.Pattern;
|
||||||
public class GenerateUTR30DataFiles {
|
public class GenerateUTR30DataFiles {
|
||||||
private static final String ICU_SVN_TAG_URL
|
private static final String ICU_SVN_TAG_URL
|
||||||
= "http://source.icu-project.org/repos/icu/icu/tags";
|
= "http://source.icu-project.org/repos/icu/icu/tags";
|
||||||
private static final String ICU_RELEASE_TAG = "release-52-1";
|
private static final String ICU_RELEASE_TAG = "release-54-1";
|
||||||
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
|
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
|
||||||
private static final String NFC_TXT = "nfc.txt";
|
private static final String NFC_TXT = "nfc.txt";
|
||||||
private static final String NFKC_TXT = "nfkc.txt";
|
private static final String NFKC_TXT = "nfkc.txt";
|
||||||
|
|
|
@ -35,7 +35,7 @@ com.google.inject.guice.version = 3.0
|
||||||
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
|
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
|
||||||
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
|
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
|
||||||
/com.googlecode.mp4parser/isoparser = 1.0.2
|
/com.googlecode.mp4parser/isoparser = 1.0.2
|
||||||
/com.ibm.icu/icu4j = 53.1
|
/com.ibm.icu/icu4j = 54.1
|
||||||
/com.pff/java-libpst = 0.8.1
|
/com.pff/java-libpst = 0.8.1
|
||||||
/com.spatial4j/spatial4j = 0.4.1
|
/com.spatial4j/spatial4j = 0.4.1
|
||||||
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
786d9055d4ca8c1aab4a7d4ac8283f973fd7e41f
|
|
|
@ -0,0 +1 @@
|
||||||
|
3f66ecd5871467598bc81662817b80612a0a907f
|
|
@ -1 +0,0 @@
|
||||||
786d9055d4ca8c1aab4a7d4ac8283f973fd7e41f
|
|
|
@ -0,0 +1 @@
|
||||||
|
3f66ecd5871467598bc81662817b80612a0a907f
|
Loading…
Reference in New Issue