mirror of https://github.com/apache/lucene.git
LUCENE-9773: upgrade icu to 68.2 (#2372)
Upgrade icu from 62.2 to 68.2, which adds Unicode 13 support. Modify GenerateUTR30DataFiles to take the icu release tag as a program argument. Gradle populates this argument automatically, removing a manual step from the regeneration process.
This commit is contained in:
parent
ef920388e6
commit
dd91f5ca82
|
@ -51,12 +51,16 @@ configure(project(":lucene:analysis:icu")) {
|
|||
|
||||
doFirst {
|
||||
// all these steps must be done sequentially: it's a pipeline resulting in utr30.nrm
|
||||
def v = getVersion('com.ibm.icu', 'icu4j');
|
||||
project.javaexec {
|
||||
main = "org.apache.lucene.analysis.icu.GenerateUTR30DataFiles"
|
||||
classpath = sourceSets.tools.runtimeClasspath
|
||||
|
||||
ignoreExitValue false
|
||||
workingDir utr30DataDir
|
||||
args = [
|
||||
"release-${v.replace(".", "-")}"
|
||||
]
|
||||
}
|
||||
|
||||
project.exec {
|
||||
|
|
|
@ -256,6 +256,8 @@ Other
|
|||
* LUCENE-9627: Remove unused Lucene50FieldInfosFormat codec and slightly refactor some codecs
|
||||
to separate reading header/footer from reading content of the file. (Ignacio Vera)
|
||||
|
||||
* LUCENE-9773: Upgrade icu to 68.2 (Robert Muir)
|
||||
|
||||
======================= Lucene 8.9.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@ -56,6 +56,7 @@
|
|||
FE58>002D
|
||||
FE63>002D
|
||||
FF0D>002D
|
||||
10EAD>002D
|
||||
|
||||
## Greek letterforms folding (done by kd)
|
||||
|
||||
|
|
|
@ -76,6 +76,7 @@
|
|||
0AFD..0AFF>
|
||||
0B3C>
|
||||
0B4D>
|
||||
0B55>
|
||||
0BCD>
|
||||
0C4D>
|
||||
0CBC>
|
||||
|
@ -85,6 +86,7 @@
|
|||
0DCA>
|
||||
0E47..0E4C>
|
||||
0E4E>
|
||||
0EBA>
|
||||
0EC8..0ECC>
|
||||
0F18..0F19>
|
||||
0F35>
|
||||
|
@ -96,9 +98,12 @@
|
|||
0FC6>
|
||||
1037>
|
||||
1039..103A>
|
||||
1063..1064>
|
||||
1069..106D>
|
||||
1087..108D>
|
||||
108F>
|
||||
109A..109B>
|
||||
135D..135F>
|
||||
17C9..17D3>
|
||||
17DD>
|
||||
1939..193B>
|
||||
|
@ -135,8 +140,8 @@ A67C..A67D>
|
|||
A67F>
|
||||
A69C..A69D>
|
||||
A6F0..A6F1>
|
||||
A717..A721>
|
||||
A788>
|
||||
A700..A721>
|
||||
A788..A78A>
|
||||
A7F8..A7F9>
|
||||
A8C4>
|
||||
A8E0..A8F1>
|
||||
|
@ -149,6 +154,7 @@ AA7B..AA7D>
|
|||
AABF..AAC2>
|
||||
AAF6>
|
||||
AB5B..AB5F>
|
||||
AB69..AB6B>
|
||||
ABEC..ABED>
|
||||
FB1E>
|
||||
FE20..FE2F>
|
||||
|
@ -180,6 +186,9 @@ FFE3>
|
|||
116B6..116B7>
|
||||
1172B>
|
||||
11839..1183A>
|
||||
1193D..1193E>
|
||||
11943>
|
||||
119E0>
|
||||
11A34>
|
||||
11A47>
|
||||
11A99>
|
||||
|
@ -188,12 +197,16 @@ FFE3>
|
|||
11D44..11D45>
|
||||
11D97>
|
||||
16AF0..16AF4>
|
||||
16B30..16B36>
|
||||
16F8F..16F9F>
|
||||
16FF0..16FF1>
|
||||
1D167..1D169>
|
||||
1D16D..1D172>
|
||||
1D17B..1D182>
|
||||
1D185..1D18B>
|
||||
1D1AA..1D1AD>
|
||||
1E130..1E136>
|
||||
1E2EC..1E2EF>
|
||||
1E8D0..1E8D6>
|
||||
1E944..1E946>
|
||||
1E948..1E94A>
|
||||
|
|
|
@ -580,6 +580,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
118E7>0037 # WARANG CITI DIGIT SEVEN
|
||||
118E8>0038 # WARANG CITI DIGIT EIGHT
|
||||
118E9>0039 # WARANG CITI DIGIT NINE
|
||||
11950>0030 # DIVES AKURU DIGIT ZERO
|
||||
11951>0031 # DIVES AKURU DIGIT ONE
|
||||
11952>0032 # DIVES AKURU DIGIT TWO
|
||||
11953>0033 # DIVES AKURU DIGIT THREE
|
||||
11954>0034 # DIVES AKURU DIGIT FOUR
|
||||
11955>0035 # DIVES AKURU DIGIT FIVE
|
||||
11956>0036 # DIVES AKURU DIGIT SIX
|
||||
11957>0037 # DIVES AKURU DIGIT SEVEN
|
||||
11958>0038 # DIVES AKURU DIGIT EIGHT
|
||||
11959>0039 # DIVES AKURU DIGIT NINE
|
||||
11C50>0030 # BHAIKSUKI DIGIT ZERO
|
||||
11C51>0031 # BHAIKSUKI DIGIT ONE
|
||||
11C52>0032 # BHAIKSUKI DIGIT TWO
|
||||
|
@ -630,6 +640,26 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
16B57>0037 # PAHAWH HMONG DIGIT SEVEN
|
||||
16B58>0038 # PAHAWH HMONG DIGIT EIGHT
|
||||
16B59>0039 # PAHAWH HMONG DIGIT NINE
|
||||
1E140>0030 # NYIAKENG PUACHUE HMONG DIGIT ZERO
|
||||
1E141>0031 # NYIAKENG PUACHUE HMONG DIGIT ONE
|
||||
1E142>0032 # NYIAKENG PUACHUE HMONG DIGIT TWO
|
||||
1E143>0033 # NYIAKENG PUACHUE HMONG DIGIT THREE
|
||||
1E144>0034 # NYIAKENG PUACHUE HMONG DIGIT FOUR
|
||||
1E145>0035 # NYIAKENG PUACHUE HMONG DIGIT FIVE
|
||||
1E146>0036 # NYIAKENG PUACHUE HMONG DIGIT SIX
|
||||
1E147>0037 # NYIAKENG PUACHUE HMONG DIGIT SEVEN
|
||||
1E148>0038 # NYIAKENG PUACHUE HMONG DIGIT EIGHT
|
||||
1E149>0039 # NYIAKENG PUACHUE HMONG DIGIT NINE
|
||||
1E2F0>0030 # WANCHO DIGIT ZERO
|
||||
1E2F1>0031 # WANCHO DIGIT ONE
|
||||
1E2F2>0032 # WANCHO DIGIT TWO
|
||||
1E2F3>0033 # WANCHO DIGIT THREE
|
||||
1E2F4>0034 # WANCHO DIGIT FOUR
|
||||
1E2F5>0035 # WANCHO DIGIT FIVE
|
||||
1E2F6>0036 # WANCHO DIGIT SIX
|
||||
1E2F7>0037 # WANCHO DIGIT SEVEN
|
||||
1E2F8>0038 # WANCHO DIGIT EIGHT
|
||||
1E2F9>0039 # WANCHO DIGIT NINE
|
||||
1E950>0030 # ADLAM DIGIT ZERO
|
||||
1E951>0031 # ADLAM DIGIT ONE
|
||||
1E952>0032 # ADLAM DIGIT TWO
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
#
|
||||
# Complete data for Unicode NFC normalization.
|
||||
|
||||
* Unicode 11.0.0
|
||||
* Unicode 13.0.0
|
||||
|
||||
# Canonical_Combining_Class (ccc) values
|
||||
0300..0314:230
|
||||
|
@ -176,6 +176,7 @@
|
|||
0E3A:9
|
||||
0E48..0E4B:107
|
||||
0EB8..0EB9:118
|
||||
0EBA:9
|
||||
0EC8..0ECB:122
|
||||
0F18..0F19:220
|
||||
0F35:220
|
||||
|
@ -211,6 +212,7 @@
|
|||
1AB5..1ABA:220
|
||||
1ABB..1ABC:230
|
||||
1ABD:220
|
||||
1ABF..1AC0:220
|
||||
1B34:7
|
||||
1B44:9
|
||||
1B6B:230
|
||||
|
@ -275,6 +277,7 @@ A674..A67D:230
|
|||
A69E..A69F:230
|
||||
A6F0..A6F1:230
|
||||
A806:9
|
||||
A82C:9
|
||||
A8C4:9
|
||||
A8E0..A8F1:230
|
||||
A92B..A92D:220
|
||||
|
@ -305,6 +308,7 @@ FE2E..FE2F:230
|
|||
10AE5:230
|
||||
10AE6:220
|
||||
10D24..10D27:230
|
||||
10EAB..10EAC:230
|
||||
10F46..10F47:220
|
||||
10F48..10F4A:230
|
||||
10F4B:220
|
||||
|
@ -340,6 +344,9 @@ FE2E..FE2F:230
|
|||
1172B:9
|
||||
11839:9
|
||||
1183A:7
|
||||
1193D..1193E:9
|
||||
11943:7
|
||||
119E0:9
|
||||
11A34:9
|
||||
11A47:9
|
||||
11A99:9
|
||||
|
@ -349,6 +356,7 @@ FE2E..FE2F:230
|
|||
11D97:9
|
||||
16AF0..16AF4:1
|
||||
16B30..16B36:230
|
||||
16FF0..16FF1:6
|
||||
1BC9E:1
|
||||
1D165..1D166:216
|
||||
1D167..1D169:1
|
||||
|
@ -364,6 +372,8 @@ FE2E..FE2F:230
|
|||
1E01B..1E021:230
|
||||
1E023..1E024:230
|
||||
1E026..1E02A:230
|
||||
1E130..1E136:230
|
||||
1E2EC..1E2EF:230
|
||||
1E8D0..1E8D6:220
|
||||
1E944..1E949:230
|
||||
1E94A:7
|
||||
|
@ -1874,6 +1884,7 @@ FB4E>05E4 05BF
|
|||
114BE=114B9 114BD
|
||||
115BA=115B8 115AF
|
||||
115BB=115B9 115AF
|
||||
11938=11935 11930
|
||||
1D15E>1D157 1D165
|
||||
1D15F>1D158 1D165
|
||||
1D160>1D15F 1D16E
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
# to NFKC one-way mappings.
|
||||
# Use this file as the second gennorm2 input file after nfc.txt.
|
||||
|
||||
* Unicode 11.0.0
|
||||
* Unicode 13.0.0
|
||||
|
||||
00A0>0020
|
||||
00A8>0020 0308
|
||||
|
@ -1107,6 +1107,7 @@
|
|||
32FC>30F0
|
||||
32FD>30F1
|
||||
32FE>30F2
|
||||
32FF>4EE4 548C
|
||||
3300>30A2 30D1 30FC 30C8
|
||||
3301>30A2 30EB 30D5 30A1
|
||||
3302>30A2 30F3 30DA 30A2
|
||||
|
@ -1372,6 +1373,7 @@ AB5C>A727
|
|||
AB5D>AB37
|
||||
AB5E>026B
|
||||
AB5F>AB52
|
||||
AB69>028D
|
||||
FB00>0066 0066
|
||||
FB01>0066 0069
|
||||
FB02>0066 006C
|
||||
|
@ -3630,6 +3632,7 @@ FFEE>25CB
|
|||
1F14F>0057 0043
|
||||
1F16A>004D 0043
|
||||
1F16B>004D 0044
|
||||
1F16C>004D 0052
|
||||
1F190>0044 004A
|
||||
1F200>307B 304B
|
||||
1F201>30B3 30B3
|
||||
|
@ -3689,3 +3692,13 @@ FFEE>25CB
|
|||
1F248>3014 6557 3015
|
||||
1F250>5F97
|
||||
1F251>53EF
|
||||
1FBF0>0030
|
||||
1FBF1>0031
|
||||
1FBF2>0032
|
||||
1FBF3>0033
|
||||
1FBF4>0034
|
||||
1FBF5>0035
|
||||
1FBF6>0036
|
||||
1FBF7>0037
|
||||
1FBF8>0038
|
||||
1FBF9>0039
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||
|
||||
* Unicode 11.0.0
|
||||
* Unicode 13.0.0
|
||||
|
||||
0041>0061
|
||||
0042>0062
|
||||
|
@ -2082,6 +2082,7 @@
|
|||
32FC>30F0
|
||||
32FD>30F1
|
||||
32FE>30F2
|
||||
32FF>4EE4 548C
|
||||
3300>30A2 30D1 30FC 30C8
|
||||
3301>30A2 30EB 30D5 30A1
|
||||
3302>30A2 30F3 30DA 30A2
|
||||
|
@ -2450,12 +2451,23 @@ A7B3>AB53
|
|||
A7B4>A7B5
|
||||
A7B6>A7B7
|
||||
A7B8>A7B9
|
||||
A7BA>A7BB
|
||||
A7BC>A7BD
|
||||
A7BE>A7BF
|
||||
A7C2>A7C3
|
||||
A7C4>A794
|
||||
A7C5>0282
|
||||
A7C6>1D8E
|
||||
A7C7>A7C8
|
||||
A7C9>A7CA
|
||||
A7F5>A7F6
|
||||
A7F8>0127
|
||||
A7F9>0153
|
||||
AB5C>A727
|
||||
AB5D>AB37
|
||||
AB5E>026B
|
||||
AB5F>AB52
|
||||
AB69>028D
|
||||
AB70>13A0
|
||||
AB71>13A1
|
||||
AB72>13A2
|
||||
|
@ -5319,6 +5331,7 @@ FFF0..FFF8>
|
|||
1F14F>0077 0063
|
||||
1F16A>006D 0063
|
||||
1F16B>006D 0064
|
||||
1F16C>006D 0072
|
||||
1F190>0064 006A
|
||||
1F200>307B 304B
|
||||
1F201>30B3 30B3
|
||||
|
@ -5378,6 +5391,16 @@ FFF0..FFF8>
|
|||
1F248>3014 6557 3015
|
||||
1F250>5F97
|
||||
1F251>53EF
|
||||
1FBF0>0030
|
||||
1FBF1>0031
|
||||
1FBF2>0032
|
||||
1FBF3>0033
|
||||
1FBF4>0034
|
||||
1FBF5>0035
|
||||
1FBF6>0036
|
||||
1FBF7>0037
|
||||
1FBF8>0038
|
||||
1FBF9>0039
|
||||
2F800>4E3D
|
||||
2F801>4E38
|
||||
2F802>4E41
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -42,7 +42,7 @@ import java.util.regex.Pattern;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt
|
||||
* Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt for the specified icu release tag.
|
||||
*
|
||||
* <p>ASSUMPTION: This class will be run with current directory set to
|
||||
* lucene/analysis/icu/src/data/utr30/
|
||||
|
@ -56,7 +56,6 @@ import java.util.stream.Collectors;
|
|||
*/
|
||||
public class GenerateUTR30DataFiles {
|
||||
private static final String ICU_GIT_TAG_URL = "https://raw.githubusercontent.com/unicode-org/icu";
|
||||
private static final String ICU_RELEASE_TAG = "maint/maint-62";
|
||||
private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
|
||||
private static final String NFC_TXT = "nfc.txt";
|
||||
private static final String NFKC_TXT = "nfkc.txt";
|
||||
|
@ -74,7 +73,11 @@ public class GenerateUTR30DataFiles {
|
|||
|
||||
public static void main(String args[]) {
|
||||
try {
|
||||
getNFKCDataFilesFromIcuProject();
|
||||
if (args.length != 1) {
|
||||
throw new IllegalArgumentException(
|
||||
"usage: " + GenerateUTR30DataFiles.class.getName() + " <releaseTag>");
|
||||
}
|
||||
getNFKCDataFilesFromIcuProject(args[0]);
|
||||
expandRulesInUTR30DataFiles();
|
||||
} catch (Throwable t) {
|
||||
t.printStackTrace(System.err);
|
||||
|
@ -151,9 +154,9 @@ public class GenerateUTR30DataFiles {
|
|||
}
|
||||
}
|
||||
|
||||
private static void getNFKCDataFilesFromIcuProject() throws IOException {
|
||||
private static void getNFKCDataFilesFromIcuProject(String releaseTag) throws IOException {
|
||||
URL icuTagsURL = new URL(ICU_GIT_TAG_URL + "/");
|
||||
URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/");
|
||||
URL icuReleaseTagURL = new URL(icuTagsURL, releaseTag + "/");
|
||||
URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/");
|
||||
|
||||
System.err.print("Downloading " + NFKC_TXT + " ... ");
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
9ad0d915018dcbb394678a920d72f606cd1c7214
|
|
@ -0,0 +1 @@
|
|||
76893e6000401ace133a65262254be0ebe556d46
|
|
@ -1 +0,0 @@
|
|||
9ad0d915018dcbb394678a920d72f606cd1c7214
|
|
@ -0,0 +1 @@
|
|||
76893e6000401ace133a65262254be0ebe556d46
|
|
@ -23,7 +23,7 @@ com.googlecode.juniversalchardet:juniversalchardet:1.0.3 (1 constraints: 0605f33
|
|||
com.googlecode.mp4parser:isoparser:1.1.22 (1 constraints: 38052d3b)
|
||||
com.healthmarketscience.jackcess:jackcess:3.0.1 (1 constraints: 0605fb35)
|
||||
com.healthmarketscience.jackcess:jackcess-encrypt:3.0.0 (1 constraints: 0505fa35)
|
||||
com.ibm.icu:icu4j:62.2 (1 constraints: de040d31)
|
||||
com.ibm.icu:icu4j:68.2 (1 constraints: e4041f31)
|
||||
com.jayway.jsonpath:json-path:2.4.0 (1 constraints: 08050136)
|
||||
com.lmax:disruptor:3.4.2 (1 constraints: 0b050836)
|
||||
com.pff:java-libpst:0.8.1 (1 constraints: 0b050436)
|
||||
|
|
|
@ -16,7 +16,7 @@ com.googlecode.juniversalchardet:juniversalchardet=1.0.3
|
|||
com.googlecode.mp4parser:isoparser=1.1.22
|
||||
com.healthmarketscience.jackcess:jackcess-encrypt=3.0.0
|
||||
com.healthmarketscience.jackcess:jackcess=3.0.1
|
||||
com.ibm.icu:icu4j=62.2
|
||||
com.ibm.icu:icu4j=68.2
|
||||
com.jayway.jsonpath:json-path=2.4.0
|
||||
com.lmax:disruptor=3.4.2
|
||||
com.pff:java-libpst=0.8.1
|
||||
|
|
Loading…
Reference in New Issue