LUCENE-9773: upgrade icu to 68.2 (#2372)

Upgrade from icu 62.2 to 68.2, with Unicode 13 support.

Modify GenerateUTR30DataFiles to take the release tag as a program
argument. Gradle populates this automatically, removing a manual step
from the regeneration process.
This commit is contained in:
Robert Muir 2021-02-15 14:56:13 -05:00 committed by GitHub
parent ef920388e6
commit dd91f5ca82
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
18 changed files with 114 additions and 14 deletions

View File

@ -51,12 +51,16 @@ configure(project(":lucene:analysis:icu")) {
doFirst {
// all these steps must be done sequentially: it's a pipeline resulting in utr30.nrm
def v = getVersion('com.ibm.icu', 'icu4j');
project.javaexec {
main = "org.apache.lucene.analysis.icu.GenerateUTR30DataFiles"
classpath = sourceSets.tools.runtimeClasspath
ignoreExitValue false
workingDir utr30DataDir
args = [
"release-${v.replace(".", "-")}"
]
}
project.exec {

View File

@ -256,6 +256,8 @@ Other
* LUCENE-9627: Remove unused Lucene50FieldInfosFormat codec and refactor some codecs slightly
to separate reading the header/footer from reading the content of the file. (Ignacio Vera)
* LUCENE-9773: Upgrade icu to 68.2 (Robert Muir)
======================= Lucene 8.9.0 =======================
API Changes

View File

@ -56,6 +56,7 @@
FE58>002D
FE63>002D
FF0D>002D
10EAD>002D
## Greek letterforms folding (done by kd)

View File

@ -76,6 +76,7 @@
0AFD..0AFF>
0B3C>
0B4D>
0B55>
0BCD>
0C4D>
0CBC>
@ -85,6 +86,7 @@
0DCA>
0E47..0E4C>
0E4E>
0EBA>
0EC8..0ECC>
0F18..0F19>
0F35>
@ -96,9 +98,12 @@
0FC6>
1037>
1039..103A>
1063..1064>
1069..106D>
1087..108D>
108F>
109A..109B>
135D..135F>
17C9..17D3>
17DD>
1939..193B>
@ -135,8 +140,8 @@ A67C..A67D>
A67F>
A69C..A69D>
A6F0..A6F1>
A717..A721>
A788>
A700..A721>
A788..A78A>
A7F8..A7F9>
A8C4>
A8E0..A8F1>
@ -149,6 +154,7 @@ AA7B..AA7D>
AABF..AAC2>
AAF6>
AB5B..AB5F>
AB69..AB6B>
ABEC..ABED>
FB1E>
FE20..FE2F>
@ -180,6 +186,9 @@ FFE3>
116B6..116B7>
1172B>
11839..1183A>
1193D..1193E>
11943>
119E0>
11A34>
11A47>
11A99>
@ -188,12 +197,16 @@ FFE3>
11D44..11D45>
11D97>
16AF0..16AF4>
16B30..16B36>
16F8F..16F9F>
16FF0..16FF1>
1D167..1D169>
1D16D..1D172>
1D17B..1D182>
1D185..1D18B>
1D1AA..1D1AD>
1E130..1E136>
1E2EC..1E2EF>
1E8D0..1E8D6>
1E944..1E946>
1E948..1E94A>

View File

@ -580,6 +580,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
118E7>0037 # WARANG CITI DIGIT SEVEN
118E8>0038 # WARANG CITI DIGIT EIGHT
118E9>0039 # WARANG CITI DIGIT NINE
11950>0030 # DIVES AKURU DIGIT ZERO
11951>0031 # DIVES AKURU DIGIT ONE
11952>0032 # DIVES AKURU DIGIT TWO
11953>0033 # DIVES AKURU DIGIT THREE
11954>0034 # DIVES AKURU DIGIT FOUR
11955>0035 # DIVES AKURU DIGIT FIVE
11956>0036 # DIVES AKURU DIGIT SIX
11957>0037 # DIVES AKURU DIGIT SEVEN
11958>0038 # DIVES AKURU DIGIT EIGHT
11959>0039 # DIVES AKURU DIGIT NINE
11C50>0030 # BHAIKSUKI DIGIT ZERO
11C51>0031 # BHAIKSUKI DIGIT ONE
11C52>0032 # BHAIKSUKI DIGIT TWO
@ -630,6 +640,26 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
16B57>0037 # PAHAWH HMONG DIGIT SEVEN
16B58>0038 # PAHAWH HMONG DIGIT EIGHT
16B59>0039 # PAHAWH HMONG DIGIT NINE
1E140>0030 # NYIAKENG PUACHUE HMONG DIGIT ZERO
1E141>0031 # NYIAKENG PUACHUE HMONG DIGIT ONE
1E142>0032 # NYIAKENG PUACHUE HMONG DIGIT TWO
1E143>0033 # NYIAKENG PUACHUE HMONG DIGIT THREE
1E144>0034 # NYIAKENG PUACHUE HMONG DIGIT FOUR
1E145>0035 # NYIAKENG PUACHUE HMONG DIGIT FIVE
1E146>0036 # NYIAKENG PUACHUE HMONG DIGIT SIX
1E147>0037 # NYIAKENG PUACHUE HMONG DIGIT SEVEN
1E148>0038 # NYIAKENG PUACHUE HMONG DIGIT EIGHT
1E149>0039 # NYIAKENG PUACHUE HMONG DIGIT NINE
1E2F0>0030 # WANCHO DIGIT ZERO
1E2F1>0031 # WANCHO DIGIT ONE
1E2F2>0032 # WANCHO DIGIT TWO
1E2F3>0033 # WANCHO DIGIT THREE
1E2F4>0034 # WANCHO DIGIT FOUR
1E2F5>0035 # WANCHO DIGIT FIVE
1E2F6>0036 # WANCHO DIGIT SIX
1E2F7>0037 # WANCHO DIGIT SEVEN
1E2F8>0038 # WANCHO DIGIT EIGHT
1E2F9>0039 # WANCHO DIGIT NINE
1E950>0030 # ADLAM DIGIT ZERO
1E951>0031 # ADLAM DIGIT ONE
1E952>0032 # ADLAM DIGIT TWO

View File

@ -9,7 +9,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 11.0.0
* Unicode 13.0.0
# Canonical_Combining_Class (ccc) values
0300..0314:230
@ -176,6 +176,7 @@
0E3A:9
0E48..0E4B:107
0EB8..0EB9:118
0EBA:9
0EC8..0ECB:122
0F18..0F19:220
0F35:220
@ -211,6 +212,7 @@
1AB5..1ABA:220
1ABB..1ABC:230
1ABD:220
1ABF..1AC0:220
1B34:7
1B44:9
1B6B:230
@ -275,6 +277,7 @@ A674..A67D:230
A69E..A69F:230
A6F0..A6F1:230
A806:9
A82C:9
A8C4:9
A8E0..A8F1:230
A92B..A92D:220
@ -305,6 +308,7 @@ FE2E..FE2F:230
10AE5:230
10AE6:220
10D24..10D27:230
10EAB..10EAC:230
10F46..10F47:220
10F48..10F4A:230
10F4B:220
@ -340,6 +344,9 @@ FE2E..FE2F:230
1172B:9
11839:9
1183A:7
1193D..1193E:9
11943:7
119E0:9
11A34:9
11A47:9
11A99:9
@ -349,6 +356,7 @@ FE2E..FE2F:230
11D97:9
16AF0..16AF4:1
16B30..16B36:230
16FF0..16FF1:6
1BC9E:1
1D165..1D166:216
1D167..1D169:1
@ -364,6 +372,8 @@ FE2E..FE2F:230
1E01B..1E021:230
1E023..1E024:230
1E026..1E02A:230
1E130..1E136:230
1E2EC..1E2EF:230
1E8D0..1E8D6:220
1E944..1E949:230
1E94A:7
@ -1874,6 +1884,7 @@ FB4E>05E4 05BF
114BE=114B9 114BD
115BA=115B8 115AF
115BB=115B9 115AF
11938=11935 11930
1D15E>1D157 1D165
1D15F>1D158 1D165
1D160>1D15F 1D16E

View File

@ -13,7 +13,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 11.0.0
* Unicode 13.0.0
00A0>0020
00A8>0020 0308
@ -1107,6 +1107,7 @@
32FC>30F0
32FD>30F1
32FE>30F2
32FF>4EE4 548C
3300>30A2 30D1 30FC 30C8
3301>30A2 30EB 30D5 30A1
3302>30A2 30F3 30DA 30A2
@ -1372,6 +1373,7 @@ AB5C>A727
AB5D>AB37
AB5E>026B
AB5F>AB52
AB69>028D
FB00>0066 0066
FB01>0066 0069
FB02>0066 006C
@ -3630,6 +3632,7 @@ FFEE>25CB
1F14F>0057 0043
1F16A>004D 0043
1F16B>004D 0044
1F16C>004D 0052
1F190>0044 004A
1F200>307B 304B
1F201>30B3 30B3
@ -3689,3 +3692,13 @@ FFEE>25CB
1F248>3014 6557 3015
1F250>5F97
1F251>53EF
1FBF0>0030
1FBF1>0031
1FBF2>0032
1FBF3>0033
1FBF4>0034
1FBF5>0035
1FBF6>0036
1FBF7>0037
1FBF8>0038
1FBF9>0039

View File

@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 11.0.0
* Unicode 13.0.0
0041>0061
0042>0062
@ -2082,6 +2082,7 @@
32FC>30F0
32FD>30F1
32FE>30F2
32FF>4EE4 548C
3300>30A2 30D1 30FC 30C8
3301>30A2 30EB 30D5 30A1
3302>30A2 30F3 30DA 30A2
@ -2450,12 +2451,23 @@ A7B3>AB53
A7B4>A7B5
A7B6>A7B7
A7B8>A7B9
A7BA>A7BB
A7BC>A7BD
A7BE>A7BF
A7C2>A7C3
A7C4>A794
A7C5>0282
A7C6>1D8E
A7C7>A7C8
A7C9>A7CA
A7F5>A7F6
A7F8>0127
A7F9>0153
AB5C>A727
AB5D>AB37
AB5E>026B
AB5F>AB52
AB69>028D
AB70>13A0
AB71>13A1
AB72>13A2
@ -5319,6 +5331,7 @@ FFF0..FFF8>
1F14F>0077 0063
1F16A>006D 0063
1F16B>006D 0064
1F16C>006D 0072
1F190>0064 006A
1F200>307B 304B
1F201>30B3 30B3
@ -5378,6 +5391,16 @@ FFF0..FFF8>
1F248>3014 6557 3015
1F250>5F97
1F251>53EF
1FBF0>0030
1FBF1>0031
1FBF2>0032
1FBF3>0033
1FBF4>0034
1FBF5>0035
1FBF6>0036
1FBF7>0037
1FBF8>0038
1FBF9>0039
2F800>4E3D
2F801>4E38
2F802>4E41

View File

@ -42,7 +42,7 @@ import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt
* Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt for the specified icu release tag.
*
* <p>ASSUMPTION: This class will be run with current directory set to
* lucene/analysis/icu/src/data/utr30/
@ -56,7 +56,6 @@ import java.util.stream.Collectors;
*/
public class GenerateUTR30DataFiles {
private static final String ICU_GIT_TAG_URL = "https://raw.githubusercontent.com/unicode-org/icu";
private static final String ICU_RELEASE_TAG = "maint/maint-62";
private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
@ -74,7 +73,11 @@ public class GenerateUTR30DataFiles {
public static void main(String args[]) {
try {
getNFKCDataFilesFromIcuProject();
if (args.length != 1) {
throw new IllegalArgumentException(
"usage: " + GenerateUTR30DataFiles.class.getName() + " <releaseTag>");
}
getNFKCDataFilesFromIcuProject(args[0]);
expandRulesInUTR30DataFiles();
} catch (Throwable t) {
t.printStackTrace(System.err);
@ -151,9 +154,9 @@ public class GenerateUTR30DataFiles {
}
}
private static void getNFKCDataFilesFromIcuProject() throws IOException {
private static void getNFKCDataFilesFromIcuProject(String releaseTag) throws IOException {
URL icuTagsURL = new URL(ICU_GIT_TAG_URL + "/");
URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/");
URL icuReleaseTagURL = new URL(icuTagsURL, releaseTag + "/");
URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/");
System.err.print("Downloading " + NFKC_TXT + " ... ");

View File

@ -1 +0,0 @@
9ad0d915018dcbb394678a920d72f606cd1c7214

View File

@ -0,0 +1 @@
76893e6000401ace133a65262254be0ebe556d46

View File

@ -1 +0,0 @@
9ad0d915018dcbb394678a920d72f606cd1c7214

View File

@ -0,0 +1 @@
76893e6000401ace133a65262254be0ebe556d46

View File

@ -23,7 +23,7 @@ com.googlecode.juniversalchardet:juniversalchardet:1.0.3 (1 constraints: 0605f33
com.googlecode.mp4parser:isoparser:1.1.22 (1 constraints: 38052d3b)
com.healthmarketscience.jackcess:jackcess:3.0.1 (1 constraints: 0605fb35)
com.healthmarketscience.jackcess:jackcess-encrypt:3.0.0 (1 constraints: 0505fa35)
com.ibm.icu:icu4j:62.2 (1 constraints: de040d31)
com.ibm.icu:icu4j:68.2 (1 constraints: e4041f31)
com.jayway.jsonpath:json-path:2.4.0 (1 constraints: 08050136)
com.lmax:disruptor:3.4.2 (1 constraints: 0b050836)
com.pff:java-libpst:0.8.1 (1 constraints: 0b050436)

View File

@ -16,7 +16,7 @@ com.googlecode.juniversalchardet:juniversalchardet=1.0.3
com.googlecode.mp4parser:isoparser=1.1.22
com.healthmarketscience.jackcess:jackcess-encrypt=3.0.0
com.healthmarketscience.jackcess:jackcess=3.0.1
com.ibm.icu:icu4j=62.2
com.ibm.icu:icu4j=68.2
com.jayway.jsonpath:json-path=2.4.0
com.lmax:disruptor=3.4.2
com.pff:java-libpst=0.8.1