mirror of https://github.com/apache/lucene.git
LUCENE-5995: upgrade to ICU 54.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629799 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
commit
f6cbe636bb
|
@ -221,6 +221,8 @@ Build
|
|||
* LUCENE-5962: Rename diffSources.py to createPatch.py and make it work with all text file types.
|
||||
(Ryan Ernst)
|
||||
|
||||
* LUCENE-5995: Upgrade ICU to 54.1 (Robert Muir)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-5563: Removed sep layout: which has fallen behind on features and doesn't
|
||||
|
|
|
@ -1,50 +0,0 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#
|
||||
# Parses Myanmar text, with syllable as token.
|
||||
#
|
||||
|
||||
$Cons = [[:Other_Letter:]&[:Myanmar:]];
|
||||
$Virama = [\u1039];
|
||||
$Asat = [\u103A];
|
||||
|
||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
||||
|
||||
#
|
||||
# default numerical definitions
|
||||
#
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
$ConsEx = $Cons ($Extend | $Format)*;
|
||||
$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
|
||||
$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
|
||||
$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
|
||||
|
||||
!!forward;
|
||||
$MyanmarJoinedSyllableEx {200};
|
||||
|
||||
# default numeric rules
|
||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
@ -49,6 +49,7 @@
|
|||
2E17>002D
|
||||
2E1A>002D
|
||||
2E3A..2E3B>002D
|
||||
2E40>002D
|
||||
301C>002D
|
||||
3030>002D
|
||||
30A0>002D
|
||||
|
|
|
@ -102,6 +102,7 @@
|
|||
1939..193B>
|
||||
1A75..1A7C>
|
||||
1A7F>
|
||||
1AB0..1ABD>
|
||||
1B34>
|
||||
1B44>
|
||||
1B6B..1B73>
|
||||
|
@ -111,8 +112,10 @@
|
|||
1CD0..1CE8>
|
||||
1CED>
|
||||
1CF4>
|
||||
1CF8..1CF9>
|
||||
1D2C..1D6A>
|
||||
1DC4..1DCF>
|
||||
1DF5>
|
||||
1DFD..1DFF>
|
||||
1FBD>
|
||||
1FBF..1FC1>
|
||||
|
@ -128,6 +131,7 @@
|
|||
A66F>
|
||||
A67C..A67D>
|
||||
A67F>
|
||||
A69C..A69D>
|
||||
A6F0..A6F1>
|
||||
A717..A721>
|
||||
A788>
|
||||
|
@ -138,27 +142,43 @@ A92B..A92E>
|
|||
A953>
|
||||
A9B3>
|
||||
A9C0>
|
||||
AA7B>
|
||||
A9E5>
|
||||
AA7B..AA7D>
|
||||
AABF..AAC2>
|
||||
AAF6>
|
||||
AB5B..AB5F>
|
||||
ABEC..ABED>
|
||||
FB1E>
|
||||
FE20..FE26>
|
||||
FE20..FE2D>
|
||||
FF3E>
|
||||
FF40>
|
||||
FF70>
|
||||
FF9E..FF9F>
|
||||
FFE3>
|
||||
102E0>
|
||||
10AE5..10AE6>
|
||||
110B9..110BA>
|
||||
11133..11134>
|
||||
11173>
|
||||
111C0>
|
||||
11235..11236>
|
||||
112E9..112EA>
|
||||
1133C>
|
||||
1134D>
|
||||
11366..1136C>
|
||||
11370..11374>
|
||||
114C2..114C3>
|
||||
115BF..115C0>
|
||||
1163F>
|
||||
116B6..116B7>
|
||||
16AF0..16AF4>
|
||||
16F8F..16F9F>
|
||||
1D167..1D169>
|
||||
1D16D..1D172>
|
||||
1D17B..1D182>
|
||||
1D185..1D18B>
|
||||
1D1AA..1D1AD>
|
||||
1E8D0..1E8D6>
|
||||
|
||||
# Latin script "composed" that do not further decompose, so decompose here
|
||||
# These are from AsciiFoldingFilter
|
||||
|
|
|
@ -151,6 +151,16 @@
|
|||
0D6D>0037 # MALAYALAM DIGIT SEVEN
|
||||
0D6E>0038 # MALAYALAM DIGIT EIGHT
|
||||
0D6F>0039 # MALAYALAM DIGIT NINE
|
||||
0DE6>0030 # SINHALA LITH DIGIT ZERO
|
||||
0DE7>0031 # SINHALA LITH DIGIT ONE
|
||||
0DE8>0032 # SINHALA LITH DIGIT TWO
|
||||
0DE9>0033 # SINHALA LITH DIGIT THREE
|
||||
0DEA>0034 # SINHALA LITH DIGIT FOUR
|
||||
0DEB>0035 # SINHALA LITH DIGIT FIVE
|
||||
0DEC>0036 # SINHALA LITH DIGIT SIX
|
||||
0DED>0037 # SINHALA LITH DIGIT SEVEN
|
||||
0DEE>0038 # SINHALA LITH DIGIT EIGHT
|
||||
0DEF>0039 # SINHALA LITH DIGIT NINE
|
||||
0E50>0030 # THAI DIGIT ZERO
|
||||
0E51>0031 # THAI DIGIT ONE
|
||||
0E52>0032 # THAI DIGIT TWO
|
||||
|
@ -388,6 +398,16 @@ A9D6>0036 # JAVANESE DIGIT SIX
|
|||
A9D7>0037 # JAVANESE DIGIT SEVEN
|
||||
A9D8>0038 # JAVANESE DIGIT EIGHT
|
||||
A9D9>0039 # JAVANESE DIGIT NINE
|
||||
A9F0>0030 # MYANMAR TAI LAING DIGIT ZERO
|
||||
A9F1>0031 # MYANMAR TAI LAING DIGIT ONE
|
||||
A9F2>0032 # MYANMAR TAI LAING DIGIT TWO
|
||||
A9F3>0033 # MYANMAR TAI LAING DIGIT THREE
|
||||
A9F4>0034 # MYANMAR TAI LAING DIGIT FOUR
|
||||
A9F5>0035 # MYANMAR TAI LAING DIGIT FIVE
|
||||
A9F6>0036 # MYANMAR TAI LAING DIGIT SIX
|
||||
A9F7>0037 # MYANMAR TAI LAING DIGIT SEVEN
|
||||
A9F8>0038 # MYANMAR TAI LAING DIGIT EIGHT
|
||||
A9F9>0039 # MYANMAR TAI LAING DIGIT NINE
|
||||
AA50>0030 # CHAM DIGIT ZERO
|
||||
AA51>0031 # CHAM DIGIT ONE
|
||||
AA52>0032 # CHAM DIGIT TWO
|
||||
|
@ -480,6 +500,36 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
111D7>0037 # SHARADA DIGIT SEVEN
|
||||
111D8>0038 # SHARADA DIGIT EIGHT
|
||||
111D9>0039 # SHARADA DIGIT NINE
|
||||
112F0>0030 # KHUDAWADI DIGIT ZERO
|
||||
112F1>0031 # KHUDAWADI DIGIT ONE
|
||||
112F2>0032 # KHUDAWADI DIGIT TWO
|
||||
112F3>0033 # KHUDAWADI DIGIT THREE
|
||||
112F4>0034 # KHUDAWADI DIGIT FOUR
|
||||
112F5>0035 # KHUDAWADI DIGIT FIVE
|
||||
112F6>0036 # KHUDAWADI DIGIT SIX
|
||||
112F7>0037 # KHUDAWADI DIGIT SEVEN
|
||||
112F8>0038 # KHUDAWADI DIGIT EIGHT
|
||||
112F9>0039 # KHUDAWADI DIGIT NINE
|
||||
114D0>0030 # TIRHUTA DIGIT ZERO
|
||||
114D1>0031 # TIRHUTA DIGIT ONE
|
||||
114D2>0032 # TIRHUTA DIGIT TWO
|
||||
114D3>0033 # TIRHUTA DIGIT THREE
|
||||
114D4>0034 # TIRHUTA DIGIT FOUR
|
||||
114D5>0035 # TIRHUTA DIGIT FIVE
|
||||
114D6>0036 # TIRHUTA DIGIT SIX
|
||||
114D7>0037 # TIRHUTA DIGIT SEVEN
|
||||
114D8>0038 # TIRHUTA DIGIT EIGHT
|
||||
114D9>0039 # TIRHUTA DIGIT NINE
|
||||
11650>0030 # MODI DIGIT ZERO
|
||||
11651>0031 # MODI DIGIT ONE
|
||||
11652>0032 # MODI DIGIT TWO
|
||||
11653>0033 # MODI DIGIT THREE
|
||||
11654>0034 # MODI DIGIT FOUR
|
||||
11655>0035 # MODI DIGIT FIVE
|
||||
11656>0036 # MODI DIGIT SIX
|
||||
11657>0037 # MODI DIGIT SEVEN
|
||||
11658>0038 # MODI DIGIT EIGHT
|
||||
11659>0039 # MODI DIGIT NINE
|
||||
116C0>0030 # TAKRI DIGIT ZERO
|
||||
116C1>0031 # TAKRI DIGIT ONE
|
||||
116C2>0032 # TAKRI DIGIT TWO
|
||||
|
@ -490,4 +540,34 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
116C7>0037 # TAKRI DIGIT SEVEN
|
||||
116C8>0038 # TAKRI DIGIT EIGHT
|
||||
116C9>0039 # TAKRI DIGIT NINE
|
||||
118E0>0030 # WARANG CITI DIGIT ZERO
|
||||
118E1>0031 # WARANG CITI DIGIT ONE
|
||||
118E2>0032 # WARANG CITI DIGIT TWO
|
||||
118E3>0033 # WARANG CITI DIGIT THREE
|
||||
118E4>0034 # WARANG CITI DIGIT FOUR
|
||||
118E5>0035 # WARANG CITI DIGIT FIVE
|
||||
118E6>0036 # WARANG CITI DIGIT SIX
|
||||
118E7>0037 # WARANG CITI DIGIT SEVEN
|
||||
118E8>0038 # WARANG CITI DIGIT EIGHT
|
||||
118E9>0039 # WARANG CITI DIGIT NINE
|
||||
16A60>0030 # MRO DIGIT ZERO
|
||||
16A61>0031 # MRO DIGIT ONE
|
||||
16A62>0032 # MRO DIGIT TWO
|
||||
16A63>0033 # MRO DIGIT THREE
|
||||
16A64>0034 # MRO DIGIT FOUR
|
||||
16A65>0035 # MRO DIGIT FIVE
|
||||
16A66>0036 # MRO DIGIT SIX
|
||||
16A67>0037 # MRO DIGIT SEVEN
|
||||
16A68>0038 # MRO DIGIT EIGHT
|
||||
16A69>0039 # MRO DIGIT NINE
|
||||
16B50>0030 # PAHAWH HMONG DIGIT ZERO
|
||||
16B51>0031 # PAHAWH HMONG DIGIT ONE
|
||||
16B52>0032 # PAHAWH HMONG DIGIT TWO
|
||||
16B53>0033 # PAHAWH HMONG DIGIT THREE
|
||||
16B54>0034 # PAHAWH HMONG DIGIT FOUR
|
||||
16B55>0035 # PAHAWH HMONG DIGIT FIVE
|
||||
16B56>0036 # PAHAWH HMONG DIGIT SIX
|
||||
16B57>0037 # PAHAWH HMONG DIGIT SEVEN
|
||||
16B58>0038 # PAHAWH HMONG DIGIT EIGHT
|
||||
16B59>0039 # PAHAWH HMONG DIGIT NINE
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 1999-2013, International Business Machines
|
||||
# Copyright (C) 1999-2014, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: nfc.txt
|
||||
|
@ -7,7 +7,7 @@
|
|||
#
|
||||
# Complete data for Unicode NFC normalization.
|
||||
|
||||
* Unicode 6.3.0
|
||||
* Unicode 7.0.0
|
||||
|
||||
# Canonical_Combining_Class (ccc) values
|
||||
0300..0314:230
|
||||
|
@ -142,7 +142,7 @@
|
|||
08F6:220
|
||||
08F7..08F8:230
|
||||
08F9..08FA:220
|
||||
08FB..08FE:230
|
||||
08FB..08FF:230
|
||||
093C:7
|
||||
094D:9
|
||||
0951:230
|
||||
|
@ -199,6 +199,10 @@
|
|||
1A60:9
|
||||
1A75..1A7C:230
|
||||
1A7F:220
|
||||
1AB0..1AB4:230
|
||||
1AB5..1ABA:220
|
||||
1ABB..1ABC:230
|
||||
1ABD:220
|
||||
1B34:7
|
||||
1B44:9
|
||||
1B6B:230
|
||||
|
@ -217,6 +221,7 @@
|
|||
1CE2..1CE8:1
|
||||
1CED:220
|
||||
1CF4:230
|
||||
1CF8..1CF9:230
|
||||
1DC0..1DC1:230
|
||||
1DC2:220
|
||||
1DC3..1DC9:230
|
||||
|
@ -226,7 +231,7 @@
|
|||
1DCE:214
|
||||
1DCF:220
|
||||
1DD0:202
|
||||
1DD1..1DE6:230
|
||||
1DD1..1DF5:230
|
||||
1DFC:233
|
||||
1DFD:220
|
||||
1DFE:230
|
||||
|
@ -274,21 +279,44 @@ AAF6:9
|
|||
ABED:9
|
||||
FB1E:26
|
||||
FE20..FE26:230
|
||||
FE27..FE2D:220
|
||||
101FD:220
|
||||
102E0:220
|
||||
10376..1037A:230
|
||||
10A0D:220
|
||||
10A0F:230
|
||||
10A38:230
|
||||
10A39:1
|
||||
10A3A:220
|
||||
10A3F:9
|
||||
10AE5:230
|
||||
10AE6:220
|
||||
11046:9
|
||||
1107F:9
|
||||
110B9:9
|
||||
110BA:7
|
||||
11100..11102:230
|
||||
11133..11134:9
|
||||
11173:7
|
||||
111C0:9
|
||||
11235:9
|
||||
11236:7
|
||||
112E9:7
|
||||
112EA:9
|
||||
1133C:7
|
||||
1134D:9
|
||||
11366..1136C:230
|
||||
11370..11374:230
|
||||
114C2:9
|
||||
114C3:7
|
||||
115BF:9
|
||||
115C0:7
|
||||
1163F:9
|
||||
116B6:9
|
||||
116B7:7
|
||||
16AF0..16AF4:1
|
||||
16B30..16B36:230
|
||||
1BC9E:1
|
||||
1D165..1D166:216
|
||||
1D167..1D169:1
|
||||
1D16D:226
|
||||
|
@ -298,6 +326,7 @@ FE20..FE26:230
|
|||
1D18A..1D18B:220
|
||||
1D1AA..1D1AD:230
|
||||
1D242..1D244:230
|
||||
1E8D0..1E8D6:220
|
||||
|
||||
# Canonical decomposition mappings
|
||||
00C0>0041 0300 # one-way: diacritic 0300
|
||||
|
@ -1798,6 +1827,13 @@ FB4E>05E4 05BF
|
|||
110AB>110A5 110BA # one-way: diacritic 110BA
|
||||
1112E=11131 11127
|
||||
1112F=11132 11127
|
||||
1134B=11347 1133E
|
||||
1134C=11347 11357
|
||||
114BB=114B9 114BA
|
||||
114BC=114B9 114B0
|
||||
114BE=114B9 114BD
|
||||
115BA=115B8 115AF
|
||||
115BB=115B9 115AF
|
||||
1D15E>1D157 1D165
|
||||
1D15F>1D158 1D165
|
||||
1D160>1D15F 1D16E
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 1999-2013, International Business Machines
|
||||
# Copyright (C) 1999-2014, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: nfkc.txt
|
||||
|
@ -11,7 +11,7 @@
|
|||
# to NFKC one-way mappings.
|
||||
# Use this file as the second gennorm2 input file after nfc.txt.
|
||||
|
||||
* Unicode 6.3.0
|
||||
* Unicode 7.0.0
|
||||
|
||||
00A0>0020
|
||||
00A8>0020 0308
|
||||
|
@ -1361,9 +1361,15 @@
|
|||
33FD>0033 0030 65E5
|
||||
33FE>0033 0031 65E5
|
||||
33FF>0067 0061 006C
|
||||
A69C>044A
|
||||
A69D>044C
|
||||
A770>A76F
|
||||
A7F8>0126
|
||||
A7F9>0153
|
||||
AB5C>A727
|
||||
AB5D>AB37
|
||||
AB5E>026B
|
||||
AB5F>AB52
|
||||
FB00>0066 0066
|
||||
FB01>0066 0069
|
||||
FB02>0066 006C
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2013 Unicode, Inc.
|
||||
# Copyright (c) 1991-2014 Unicode, Inc.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
|
@ -12,7 +12,7 @@
|
|||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||
|
||||
* Unicode 6.3.0
|
||||
* Unicode 7.0.0
|
||||
|
||||
0041>0061
|
||||
0042>0062
|
||||
|
@ -286,6 +286,7 @@
|
|||
0376>0377
|
||||
037A>0020 03B9
|
||||
037E>003B
|
||||
037F>03F3
|
||||
0384>0020 0301
|
||||
0385>0020 0308 0301
|
||||
0386>03AC
|
||||
|
@ -498,6 +499,10 @@
|
|||
0522>0523
|
||||
0524>0525
|
||||
0526>0527
|
||||
0528>0529
|
||||
052A>052B
|
||||
052C>052D
|
||||
052E>052F
|
||||
0531>0561
|
||||
0532>0562
|
||||
0533>0563
|
||||
|
@ -2308,6 +2313,10 @@ A690>A691
|
|||
A692>A693
|
||||
A694>A695
|
||||
A696>A697
|
||||
A698>A699
|
||||
A69A>A69B
|
||||
A69C>044A
|
||||
A69D>044C
|
||||
A722>A723
|
||||
A724>A725
|
||||
A726>A727
|
||||
|
@ -2359,14 +2368,28 @@ A78B>A78C
|
|||
A78D>0265
|
||||
A790>A791
|
||||
A792>A793
|
||||
A796>A797
|
||||
A798>A799
|
||||
A79A>A79B
|
||||
A79C>A79D
|
||||
A79E>A79F
|
||||
A7A0>A7A1
|
||||
A7A2>A7A3
|
||||
A7A4>A7A5
|
||||
A7A6>A7A7
|
||||
A7A8>A7A9
|
||||
A7AA>0266
|
||||
A7AB>025C
|
||||
A7AC>0261
|
||||
A7AD>026C
|
||||
A7B0>029E
|
||||
A7B1>0287
|
||||
A7F8>0127
|
||||
A7F9>0153
|
||||
AB5C>A727
|
||||
AB5D>AB37
|
||||
AB5E>026B
|
||||
AB5F>AB52
|
||||
F900>8C48
|
||||
F901>66F4
|
||||
F902>8ECA
|
||||
|
@ -3743,6 +3766,39 @@ FFF0..FFF8>
|
|||
10425>1044D
|
||||
10426>1044E
|
||||
10427>1044F
|
||||
118A0>118C0
|
||||
118A1>118C1
|
||||
118A2>118C2
|
||||
118A3>118C3
|
||||
118A4>118C4
|
||||
118A5>118C5
|
||||
118A6>118C6
|
||||
118A7>118C7
|
||||
118A8>118C8
|
||||
118A9>118C9
|
||||
118AA>118CA
|
||||
118AB>118CB
|
||||
118AC>118CC
|
||||
118AD>118CD
|
||||
118AE>118CE
|
||||
118AF>118CF
|
||||
118B0>118D0
|
||||
118B1>118D1
|
||||
118B2>118D2
|
||||
118B3>118D3
|
||||
118B4>118D4
|
||||
118B5>118D5
|
||||
118B6>118D6
|
||||
118B7>118D7
|
||||
118B8>118D8
|
||||
118B9>118D9
|
||||
118BA>118DA
|
||||
118BB>118DB
|
||||
118BC>118DC
|
||||
118BD>118DD
|
||||
118BE>118DE
|
||||
118BF>118DF
|
||||
1BCA0..1BCA3>
|
||||
1D15E>1D157 1D165
|
||||
1D15F>1D158 1D165
|
||||
1D160>1D158 1D165 1D16E
|
||||
|
|
|
@ -35,8 +35,8 @@ import com.ibm.icu.util.ULocale;
|
|||
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
||||
* but with the following tailorings:
|
||||
* <ul>
|
||||
* <li>Thai, Lao, and CJK text is broken into words with a dictionary.
|
||||
* <li>Myanmar, and Khmer text is broken into syllables
|
||||
* <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary.
|
||||
* <li>Khmer text is broken into syllables
|
||||
* based on custom BreakIterator rules.
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
|
@ -67,8 +67,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
readBreakIterator("Default.brk");
|
||||
private static final BreakIterator khmerBreakIterator =
|
||||
readBreakIterator("Khmer.brk");
|
||||
private static final BreakIterator myanmarBreakIterator =
|
||||
readBreakIterator("Myanmar.brk");
|
||||
|
||||
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
||||
private final boolean cjkAsWords;
|
||||
|
@ -94,7 +92,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
public BreakIterator getBreakIterator(int script) {
|
||||
switch(script) {
|
||||
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
|
||||
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
|
||||
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
||||
default: return (BreakIterator)defaultBreakIterator.clone();
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -122,6 +122,10 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
|
||||
}
|
||||
|
||||
public void testMyanmar() throws Exception {
|
||||
assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်ဝင်", "လှုပ်ရှား", "စေ", "ပြီး" });
|
||||
}
|
||||
|
||||
public void testThai() throws Exception {
|
||||
assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"});
|
||||
|
|
|
@ -63,7 +63,7 @@ import java.util.regex.Pattern;
|
|||
public class GenerateUTR30DataFiles {
|
||||
private static final String ICU_SVN_TAG_URL
|
||||
= "http://source.icu-project.org/repos/icu/icu/tags";
|
||||
private static final String ICU_RELEASE_TAG = "release-52-1";
|
||||
private static final String ICU_RELEASE_TAG = "release-54-1";
|
||||
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
|
||||
private static final String NFC_TXT = "nfc.txt";
|
||||
private static final String NFKC_TXT = "nfkc.txt";
|
||||
|
|
|
@ -35,7 +35,7 @@ com.google.inject.guice.version = 3.0
|
|||
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
|
||||
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
|
||||
/com.googlecode.mp4parser/isoparser = 1.0.2
|
||||
/com.ibm.icu/icu4j = 53.1
|
||||
/com.ibm.icu/icu4j = 54.1
|
||||
/com.pff/java-libpst = 0.8.1
|
||||
/com.spatial4j/spatial4j = 0.4.1
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
786d9055d4ca8c1aab4a7d4ac8283f973fd7e41f
|
|
@ -0,0 +1 @@
|
|||
3f66ecd5871467598bc81662817b80612a0a907f
|
|
@ -1 +0,0 @@
|
|||
786d9055d4ca8c1aab4a7d4ac8283f973fd7e41f
|
|
@ -0,0 +1 @@
|
|||
3f66ecd5871467598bc81662817b80612a0a907f
|
Loading…
Reference in New Issue