LUCENE-5995: upgrade to ICU 54.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629799 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-10-07 03:25:32 +00:00
commit f6cbe636bb
20 changed files with 221 additions and 69 deletions

View File

@ -221,6 +221,8 @@ Build
* LUCENE-5962: Rename diffSources.py to createPatch.py and make it work with all text file types.
(Ryan Ernst)
* LUCENE-5995: Upgrade ICU to 54.1 (Robert Muir)
Other
* LUCENE-5563: Removed sep layout: which has fallen behind on features and doesn't

View File

@ -1,50 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Parses Myanmar text, with syllable as token.
#
$Cons = [[:Other_Letter:]&[:Myanmar:]];
$Virama = [\u1039];
$Asat = [\u103A];
$WordJoin = [:Line_Break=Word_Joiner:];
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$ConsEx = $Cons ($Extend | $Format)*;
$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
!!forward;
$MyanmarJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -49,6 +49,7 @@
2E17>002D
2E1A>002D
2E3A..2E3B>002D
2E40>002D
301C>002D
3030>002D
30A0>002D

View File

@ -102,6 +102,7 @@
1939..193B>
1A75..1A7C>
1A7F>
1AB0..1ABD>
1B34>
1B44>
1B6B..1B73>
@ -111,8 +112,10 @@
1CD0..1CE8>
1CED>
1CF4>
1CF8..1CF9>
1D2C..1D6A>
1DC4..1DCF>
1DF5>
1DFD..1DFF>
1FBD>
1FBF..1FC1>
@ -128,6 +131,7 @@
A66F>
A67C..A67D>
A67F>
A69C..A69D>
A6F0..A6F1>
A717..A721>
A788>
@ -138,27 +142,43 @@ A92B..A92E>
A953>
A9B3>
A9C0>
AA7B>
A9E5>
AA7B..AA7D>
AABF..AAC2>
AAF6>
AB5B..AB5F>
ABEC..ABED>
FB1E>
FE20..FE26>
FE20..FE2D>
FF3E>
FF40>
FF70>
FF9E..FF9F>
FFE3>
102E0>
10AE5..10AE6>
110B9..110BA>
11133..11134>
11173>
111C0>
11235..11236>
112E9..112EA>
1133C>
1134D>
11366..1136C>
11370..11374>
114C2..114C3>
115BF..115C0>
1163F>
116B6..116B7>
16AF0..16AF4>
16F8F..16F9F>
1D167..1D169>
1D16D..1D172>
1D17B..1D182>
1D185..1D18B>
1D1AA..1D1AD>
1E8D0..1E8D6>
# Latin script "composed" that do not further decompose, so decompose here
# These are from AsciiFoldingFilter

View File

@ -151,6 +151,16 @@
0D6D>0037 # MALAYALAM DIGIT SEVEN
0D6E>0038 # MALAYALAM DIGIT EIGHT
0D6F>0039 # MALAYALAM DIGIT NINE
0DE6>0030 # SINHALA LITH DIGIT ZERO
0DE7>0031 # SINHALA LITH DIGIT ONE
0DE8>0032 # SINHALA LITH DIGIT TWO
0DE9>0033 # SINHALA LITH DIGIT THREE
0DEA>0034 # SINHALA LITH DIGIT FOUR
0DEB>0035 # SINHALA LITH DIGIT FIVE
0DEC>0036 # SINHALA LITH DIGIT SIX
0DED>0037 # SINHALA LITH DIGIT SEVEN
0DEE>0038 # SINHALA LITH DIGIT EIGHT
0DEF>0039 # SINHALA LITH DIGIT NINE
0E50>0030 # THAI DIGIT ZERO
0E51>0031 # THAI DIGIT ONE
0E52>0032 # THAI DIGIT TWO
@ -388,6 +398,16 @@ A9D6>0036 # JAVANESE DIGIT SIX
A9D7>0037 # JAVANESE DIGIT SEVEN
A9D8>0038 # JAVANESE DIGIT EIGHT
A9D9>0039 # JAVANESE DIGIT NINE
A9F0>0030 # MYANMAR TAI LAING DIGIT ZERO
A9F1>0031 # MYANMAR TAI LAING DIGIT ONE
A9F2>0032 # MYANMAR TAI LAING DIGIT TWO
A9F3>0033 # MYANMAR TAI LAING DIGIT THREE
A9F4>0034 # MYANMAR TAI LAING DIGIT FOUR
A9F5>0035 # MYANMAR TAI LAING DIGIT FIVE
A9F6>0036 # MYANMAR TAI LAING DIGIT SIX
A9F7>0037 # MYANMAR TAI LAING DIGIT SEVEN
A9F8>0038 # MYANMAR TAI LAING DIGIT EIGHT
A9F9>0039 # MYANMAR TAI LAING DIGIT NINE
AA50>0030 # CHAM DIGIT ZERO
AA51>0031 # CHAM DIGIT ONE
AA52>0032 # CHAM DIGIT TWO
@ -480,6 +500,36 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
111D7>0037 # SHARADA DIGIT SEVEN
111D8>0038 # SHARADA DIGIT EIGHT
111D9>0039 # SHARADA DIGIT NINE
112F0>0030 # KHUDAWADI DIGIT ZERO
112F1>0031 # KHUDAWADI DIGIT ONE
112F2>0032 # KHUDAWADI DIGIT TWO
112F3>0033 # KHUDAWADI DIGIT THREE
112F4>0034 # KHUDAWADI DIGIT FOUR
112F5>0035 # KHUDAWADI DIGIT FIVE
112F6>0036 # KHUDAWADI DIGIT SIX
112F7>0037 # KHUDAWADI DIGIT SEVEN
112F8>0038 # KHUDAWADI DIGIT EIGHT
112F9>0039 # KHUDAWADI DIGIT NINE
114D0>0030 # TIRHUTA DIGIT ZERO
114D1>0031 # TIRHUTA DIGIT ONE
114D2>0032 # TIRHUTA DIGIT TWO
114D3>0033 # TIRHUTA DIGIT THREE
114D4>0034 # TIRHUTA DIGIT FOUR
114D5>0035 # TIRHUTA DIGIT FIVE
114D6>0036 # TIRHUTA DIGIT SIX
114D7>0037 # TIRHUTA DIGIT SEVEN
114D8>0038 # TIRHUTA DIGIT EIGHT
114D9>0039 # TIRHUTA DIGIT NINE
11650>0030 # MODI DIGIT ZERO
11651>0031 # MODI DIGIT ONE
11652>0032 # MODI DIGIT TWO
11653>0033 # MODI DIGIT THREE
11654>0034 # MODI DIGIT FOUR
11655>0035 # MODI DIGIT FIVE
11656>0036 # MODI DIGIT SIX
11657>0037 # MODI DIGIT SEVEN
11658>0038 # MODI DIGIT EIGHT
11659>0039 # MODI DIGIT NINE
116C0>0030 # TAKRI DIGIT ZERO
116C1>0031 # TAKRI DIGIT ONE
116C2>0032 # TAKRI DIGIT TWO
@ -490,4 +540,34 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
116C7>0037 # TAKRI DIGIT SEVEN
116C8>0038 # TAKRI DIGIT EIGHT
116C9>0039 # TAKRI DIGIT NINE
118E0>0030 # WARANG CITI DIGIT ZERO
118E1>0031 # WARANG CITI DIGIT ONE
118E2>0032 # WARANG CITI DIGIT TWO
118E3>0033 # WARANG CITI DIGIT THREE
118E4>0034 # WARANG CITI DIGIT FOUR
118E5>0035 # WARANG CITI DIGIT FIVE
118E6>0036 # WARANG CITI DIGIT SIX
118E7>0037 # WARANG CITI DIGIT SEVEN
118E8>0038 # WARANG CITI DIGIT EIGHT
118E9>0039 # WARANG CITI DIGIT NINE
16A60>0030 # MRO DIGIT ZERO
16A61>0031 # MRO DIGIT ONE
16A62>0032 # MRO DIGIT TWO
16A63>0033 # MRO DIGIT THREE
16A64>0034 # MRO DIGIT FOUR
16A65>0035 # MRO DIGIT FIVE
16A66>0036 # MRO DIGIT SIX
16A67>0037 # MRO DIGIT SEVEN
16A68>0038 # MRO DIGIT EIGHT
16A69>0039 # MRO DIGIT NINE
16B50>0030 # PAHAWH HMONG DIGIT ZERO
16B51>0031 # PAHAWH HMONG DIGIT ONE
16B52>0032 # PAHAWH HMONG DIGIT TWO
16B53>0033 # PAHAWH HMONG DIGIT THREE
16B54>0034 # PAHAWH HMONG DIGIT FOUR
16B55>0035 # PAHAWH HMONG DIGIT FIVE
16B56>0036 # PAHAWH HMONG DIGIT SIX
16B57>0037 # PAHAWH HMONG DIGIT SEVEN
16B58>0038 # PAHAWH HMONG DIGIT EIGHT
16B59>0039 # PAHAWH HMONG DIGIT NINE

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2013, International Business Machines
# Copyright (C) 1999-2014, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfc.txt
@ -7,7 +7,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 6.3.0
* Unicode 7.0.0
# Canonical_Combining_Class (ccc) values
0300..0314:230
@ -142,7 +142,7 @@
08F6:220
08F7..08F8:230
08F9..08FA:220
08FB..08FE:230
08FB..08FF:230
093C:7
094D:9
0951:230
@ -199,6 +199,10 @@
1A60:9
1A75..1A7C:230
1A7F:220
1AB0..1AB4:230
1AB5..1ABA:220
1ABB..1ABC:230
1ABD:220
1B34:7
1B44:9
1B6B:230
@ -217,6 +221,7 @@
1CE2..1CE8:1
1CED:220
1CF4:230
1CF8..1CF9:230
1DC0..1DC1:230
1DC2:220
1DC3..1DC9:230
@ -226,7 +231,7 @@
1DCE:214
1DCF:220
1DD0:202
1DD1..1DE6:230
1DD1..1DF5:230
1DFC:233
1DFD:220
1DFE:230
@ -274,21 +279,44 @@ AAF6:9
ABED:9
FB1E:26
FE20..FE26:230
FE27..FE2D:220
101FD:220
102E0:220
10376..1037A:230
10A0D:220
10A0F:230
10A38:230
10A39:1
10A3A:220
10A3F:9
10AE5:230
10AE6:220
11046:9
1107F:9
110B9:9
110BA:7
11100..11102:230
11133..11134:9
11173:7
111C0:9
11235:9
11236:7
112E9:7
112EA:9
1133C:7
1134D:9
11366..1136C:230
11370..11374:230
114C2:9
114C3:7
115BF:9
115C0:7
1163F:9
116B6:9
116B7:7
16AF0..16AF4:1
16B30..16B36:230
1BC9E:1
1D165..1D166:216
1D167..1D169:1
1D16D:226
@ -298,6 +326,7 @@ FE20..FE26:230
1D18A..1D18B:220
1D1AA..1D1AD:230
1D242..1D244:230
1E8D0..1E8D6:220
# Canonical decomposition mappings
00C0>0041 0300 # one-way: diacritic 0300
@ -1798,6 +1827,13 @@ FB4E>05E4 05BF
110AB>110A5 110BA # one-way: diacritic 110BA
1112E=11131 11127
1112F=11132 11127
1134B=11347 1133E
1134C=11347 11357
114BB=114B9 114BA
114BC=114B9 114B0
114BE=114B9 114BD
115BA=115B8 115AF
115BB=115B9 115AF
1D15E>1D157 1D165
1D15F>1D158 1D165
1D160>1D15F 1D16E

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2013, International Business Machines
# Copyright (C) 1999-2014, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
@ -11,7 +11,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 6.3.0
* Unicode 7.0.0
00A0>0020
00A8>0020 0308
@ -1361,9 +1361,15 @@
33FD>0033 0030 65E5
33FE>0033 0031 65E5
33FF>0067 0061 006C
A69C>044A
A69D>044C
A770>A76F
A7F8>0126
A7F9>0153
AB5C>A727
AB5D>AB37
AB5E>026B
AB5F>AB52
FB00>0066 0066
FB01>0066 0069
FB02>0066 006C

View File

@ -1,5 +1,5 @@
# Unicode Character Database
# Copyright (c) 1991-2013 Unicode, Inc.
# Copyright (c) 1991-2014 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 6.3.0
* Unicode 7.0.0
0041>0061
0042>0062
@ -286,6 +286,7 @@
0376>0377
037A>0020 03B9
037E>003B
037F>03F3
0384>0020 0301
0385>0020 0308 0301
0386>03AC
@ -498,6 +499,10 @@
0522>0523
0524>0525
0526>0527
0528>0529
052A>052B
052C>052D
052E>052F
0531>0561
0532>0562
0533>0563
@ -2308,6 +2313,10 @@ A690>A691
A692>A693
A694>A695
A696>A697
A698>A699
A69A>A69B
A69C>044A
A69D>044C
A722>A723
A724>A725
A726>A727
@ -2359,14 +2368,28 @@ A78B>A78C
A78D>0265
A790>A791
A792>A793
A796>A797
A798>A799
A79A>A79B
A79C>A79D
A79E>A79F
A7A0>A7A1
A7A2>A7A3
A7A4>A7A5
A7A6>A7A7
A7A8>A7A9
A7AA>0266
A7AB>025C
A7AC>0261
A7AD>026C
A7B0>029E
A7B1>0287
A7F8>0127
A7F9>0153
AB5C>A727
AB5D>AB37
AB5E>026B
AB5F>AB52
F900>8C48
F901>66F4
F902>8ECA
@ -3743,6 +3766,39 @@ FFF0..FFF8>
10425>1044D
10426>1044E
10427>1044F
118A0>118C0
118A1>118C1
118A2>118C2
118A3>118C3
118A4>118C4
118A5>118C5
118A6>118C6
118A7>118C7
118A8>118C8
118A9>118C9
118AA>118CA
118AB>118CB
118AC>118CC
118AD>118CD
118AE>118CE
118AF>118CF
118B0>118D0
118B1>118D1
118B2>118D2
118B3>118D3
118B4>118D4
118B5>118D5
118B6>118D6
118B7>118D7
118B8>118D8
118B9>118D9
118BA>118DA
118BB>118DB
118BC>118DC
118BD>118DD
118BE>118DE
118BF>118DF
1BCA0..1BCA3>
1D15E>1D157 1D165
1D15F>1D158 1D165
1D160>1D158 1D165 1D16E

View File

@ -35,8 +35,8 @@ import com.ibm.icu.util.ULocale;
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings:
* <ul>
* <li>Thai, Lao, and CJK text is broken into words with a dictionary.
* <li>Myanmar, and Khmer text is broken into syllables
* <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary.
* <li>Khmer text is broken into syllables
* based on custom BreakIterator rules.
* </ul>
* @lucene.experimental
@ -67,8 +67,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
readBreakIterator("Default.brk");
private static final BreakIterator khmerBreakIterator =
readBreakIterator("Khmer.brk");
private static final BreakIterator myanmarBreakIterator =
readBreakIterator("Myanmar.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
private final boolean cjkAsWords;
@ -94,7 +92,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
public BreakIterator getBreakIterator(int script) {
switch(script) {
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
default: return (BreakIterator)defaultBreakIterator.clone();
}

View File

@ -122,6 +122,10 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
}
public void testMyanmar() throws Exception {
assertAnalyzesTo(a, "သက်ဝင်လှုပ်ရှားစေပြီး", new String[] { "သက်ဝင်", "လှုပ်ရှား", "စေ", "ပြီး" });
}
public void testThai() throws Exception {
assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"});

View File

@ -63,7 +63,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-52-1";
private static final String ICU_RELEASE_TAG = "release-54-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";

View File

@ -35,7 +35,7 @@ com.google.inject.guice.version = 3.0
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.0.2
/com.ibm.icu/icu4j = 53.1
/com.ibm.icu/icu4j = 54.1
/com.pff/java-libpst = 0.8.1
/com.spatial4j/spatial4j = 0.4.1

View File

@ -1 +0,0 @@
786d9055d4ca8c1aab4a7d4ac8283f973fd7e41f

View File

@ -0,0 +1 @@
3f66ecd5871467598bc81662817b80612a0a907f

View File

@ -1 +0,0 @@
786d9055d4ca8c1aab4a7d4ac8283f973fd7e41f

View File

@ -0,0 +1 @@
3f66ecd5871467598bc81662817b80612a0a907f