diff --git a/gradle/generation/icu.gradle b/gradle/generation/icu.gradle index a68b0e189e5..039db07c3ea 100644 --- a/gradle/generation/icu.gradle +++ b/gradle/generation/icu.gradle @@ -51,12 +51,16 @@ configure(project(":lucene:analysis:icu")) { doFirst { // all these steps must be done sequentially: it's a pipeline resulting in utr30.nrm + def v = getVersion('com.ibm.icu', 'icu4j'); project.javaexec { main = "org.apache.lucene.analysis.icu.GenerateUTR30DataFiles" classpath = sourceSets.tools.runtimeClasspath ignoreExitValue false workingDir utr30DataDir + args = [ + "release-${v.replace(".", "-")}" + ] } project.exec { diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index dfb6946d66d..40d61c93848 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -256,6 +256,8 @@ Other * LUCENE-9627: Remove unused Lucene50FieldInfosFormat codec and small refactor some codecs to separate reading header/footer from reading content of the file. (Ignacio Vera) +* LUCENE-9773: Upgrade icu to 68.2 (Robert Muir) + ======================= Lucene 8.9.0 ======================= API Changes diff --git a/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt b/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt index 1246a87bcea..3a6fe718f77 100644 --- a/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt +++ b/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt @@ -56,6 +56,7 @@ FE58>002D FE63>002D FF0D>002D +10EAD>002D ## Greek letterforms folding (done by kd) diff --git a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt index ac6ac1eb05b..a6c7c4b352b 100644 --- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt +++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt @@ -76,6 +76,7 @@ 0AFD..0AFF> 0B3C> 0B4D> +0B55> 0BCD> 0C4D> 0CBC> @@ -85,6 +86,7 @@ 0DCA> 0E47..0E4C> 0E4E> +0EBA> 0EC8..0ECC> 0F18..0F19> 0F35> @@ -96,9 +98,12 @@ 0FC6> 1037> 1039..103A> +1063..1064> +1069..106D> 1087..108D> 108F> 109A..109B> +135D..135F> 17C9..17D3> 17DD> 1939..193B> @@ -135,8 +140,8 @@ A67C..A67D> A67F> A69C..A69D> A6F0..A6F1> -A717..A721> -A788> +A700..A721> +A788..A78A> A7F8..A7F9> A8C4> A8E0..A8F1> @@ -149,6 +154,7 @@ AA7B..AA7D> AABF..AAC2> AAF6> AB5B..AB5F> +AB69..AB6B> ABEC..ABED> FB1E> FE20..FE2F> @@ -180,6 +186,9 @@ FFE3> 116B6..116B7> 1172B> 11839..1183A> +1193D..1193E> +11943> +119E0> 11A34> 11A47> 11A99> @@ -188,12 +197,16 @@ FFE3> 11D44..11D45> 11D97> 16AF0..16AF4> +16B30..16B36> 16F8F..16F9F> +16FF0..16FF1> 1D167..1D169> 1D16D..1D172> 1D17B..1D182> 1D185..1D18B> 1D1AA..1D1AD> +1E130..1E136> +1E2EC..1E2EF> 1E8D0..1E8D6> 1E944..1E946> 1E948..1E94A> diff --git a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt index 382a2795601..c3c28fc9b20 100644 --- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt +++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt @@ -580,6 +580,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE 118E7>0037 # WARANG CITI DIGIT SEVEN 118E8>0038 # WARANG CITI DIGIT EIGHT 118E9>0039 # WARANG CITI DIGIT NINE +11950>0030 # DIVES AKURU DIGIT ZERO +11951>0031 # DIVES AKURU DIGIT ONE +11952>0032 # DIVES AKURU DIGIT TWO +11953>0033 # DIVES AKURU DIGIT THREE +11954>0034 # DIVES AKURU DIGIT FOUR +11955>0035 # DIVES AKURU DIGIT FIVE +11956>0036 # DIVES AKURU DIGIT SIX +11957>0037 # DIVES AKURU DIGIT SEVEN +11958>0038 # DIVES AKURU DIGIT EIGHT +11959>0039 # DIVES AKURU DIGIT NINE 11C50>0030 # BHAIKSUKI DIGIT ZERO 11C51>0031 # BHAIKSUKI DIGIT ONE 11C52>0032 # BHAIKSUKI DIGIT TWO @@ -630,6 +640,26 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE 16B57>0037 # PAHAWH HMONG DIGIT SEVEN 16B58>0038 # PAHAWH HMONG DIGIT EIGHT 16B59>0039 # PAHAWH HMONG DIGIT NINE +1E140>0030 # NYIAKENG PUACHUE HMONG DIGIT ZERO +1E141>0031 # NYIAKENG PUACHUE HMONG DIGIT ONE +1E142>0032 # NYIAKENG PUACHUE HMONG DIGIT TWO +1E143>0033 # NYIAKENG PUACHUE HMONG DIGIT THREE +1E144>0034 # NYIAKENG PUACHUE HMONG DIGIT FOUR +1E145>0035 # NYIAKENG PUACHUE HMONG DIGIT FIVE +1E146>0036 # NYIAKENG PUACHUE HMONG DIGIT SIX +1E147>0037 # NYIAKENG PUACHUE HMONG DIGIT SEVEN +1E148>0038 # NYIAKENG PUACHUE HMONG DIGIT EIGHT +1E149>0039 # NYIAKENG PUACHUE HMONG DIGIT NINE +1E2F0>0030 # WANCHO DIGIT ZERO +1E2F1>0031 # WANCHO DIGIT ONE +1E2F2>0032 # WANCHO DIGIT TWO +1E2F3>0033 # WANCHO DIGIT THREE +1E2F4>0034 # WANCHO DIGIT FOUR +1E2F5>0035 # WANCHO DIGIT FIVE +1E2F6>0036 # WANCHO DIGIT SIX +1E2F7>0037 # WANCHO DIGIT SEVEN +1E2F8>0038 # WANCHO DIGIT EIGHT +1E2F9>0039 # WANCHO DIGIT NINE 1E950>0030 # ADLAM DIGIT ZERO 1E951>0031 # ADLAM DIGIT ONE 1E952>0032 # ADLAM DIGIT TWO diff --git a/lucene/analysis/icu/src/data/utr30/nfc.txt b/lucene/analysis/icu/src/data/utr30/nfc.txt index c143c1c5601..90fb17593c6 100644 --- a/lucene/analysis/icu/src/data/utr30/nfc.txt +++ b/lucene/analysis/icu/src/data/utr30/nfc.txt @@ -9,7 +9,7 @@ # # Complete data for Unicode NFC normalization. -* Unicode 11.0.0 +* Unicode 13.0.0 # Canonical_Combining_Class (ccc) values 0300..0314:230 @@ -176,6 +176,7 @@ 0E3A:9 0E48..0E4B:107 0EB8..0EB9:118 +0EBA:9 0EC8..0ECB:122 0F18..0F19:220 0F35:220 @@ -211,6 +212,7 @@ 1AB5..1ABA:220 1ABB..1ABC:230 1ABD:220 +1ABF..1AC0:220 1B34:7 1B44:9 1B6B:230 @@ -275,6 +277,7 @@ A674..A67D:230 A69E..A69F:230 A6F0..A6F1:230 A806:9 +A82C:9 A8C4:9 A8E0..A8F1:230 A92B..A92D:220 @@ -305,6 +308,7 @@ FE2E..FE2F:230 10AE5:230 10AE6:220 10D24..10D27:230 +10EAB..10EAC:230 10F46..10F47:220 10F48..10F4A:230 10F4B:220 @@ -340,6 +344,9 @@ FE2E..FE2F:230 1172B:9 11839:9 1183A:7 +1193D..1193E:9 +11943:7 +119E0:9 11A34:9 11A47:9 11A99:9 @@ -349,6 +356,7 @@ FE2E..FE2F:230 11D97:9 16AF0..16AF4:1 16B30..16B36:230 +16FF0..16FF1:6 1BC9E:1 1D165..1D166:216 1D167..1D169:1 @@ -364,6 +372,8 @@ FE2E..FE2F:230 1E01B..1E021:230 1E023..1E024:230 1E026..1E02A:230 +1E130..1E136:230 +1E2EC..1E2EF:230 1E8D0..1E8D6:220 1E944..1E949:230 1E94A:7 @@ -1874,6 +1884,7 @@ FB4E>05E4 05BF 114BE=114B9 114BD 115BA=115B8 115AF 115BB=115B9 115AF +11938=11935 11930 1D15E>1D157 1D165 1D15F>1D158 1D165 1D160>1D15F 1D16E diff --git a/lucene/analysis/icu/src/data/utr30/nfkc.txt b/lucene/analysis/icu/src/data/utr30/nfkc.txt index 416c6800ec2..675cd107aaa 100644 --- a/lucene/analysis/icu/src/data/utr30/nfkc.txt +++ b/lucene/analysis/icu/src/data/utr30/nfkc.txt @@ -13,7 +13,7 @@ # to NFKC one-way mappings. # Use this file as the second gennorm2 input file after nfc.txt. -* Unicode 11.0.0 +* Unicode 13.0.0 00A0>0020 00A8>0020 0308 @@ -1107,6 +1107,7 @@ 32FC>30F0 32FD>30F1 32FE>30F2 +32FF>4EE4 548C 3300>30A2 30D1 30FC 30C8 3301>30A2 30EB 30D5 30A1 3302>30A2 30F3 30DA 30A2 @@ -1372,6 +1373,7 @@ AB5C>A727 AB5D>AB37 AB5E>026B AB5F>AB52 +AB69>028D FB00>0066 0066 FB01>0066 0069 FB02>0066 006C @@ -3630,6 +3632,7 @@ FFEE>25CB 1F14F>0057 0043 1F16A>004D 0043 1F16B>004D 0044 +1F16C>004D 0052 1F190>0044 004A 1F200>307B 304B 1F201>30B3 30B3 @@ -3689,3 +3692,13 @@ FFEE>25CB 1F248>3014 6557 3015 1F250>5F97 1F251>53EF +1FBF0>0030 +1FBF1>0031 +1FBF2>0032 +1FBF3>0033 +1FBF4>0034 +1FBF5>0035 +1FBF6>0036 +1FBF7>0037 +1FBF8>0038 +1FBF9>0039 diff --git a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt index d37386e0506..21c441f945b 100644 --- a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt +++ b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt @@ -12,7 +12,7 @@ # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool. # Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt. -* Unicode 11.0.0 +* Unicode 13.0.0 0041>0061 0042>0062 @@ -2082,6 +2082,7 @@ 32FC>30F0 32FD>30F1 32FE>30F2 +32FF>4EE4 548C 3300>30A2 30D1 30FC 30C8 3301>30A2 30EB 30D5 30A1 3302>30A2 30F3 30DA 30A2 @@ -2450,12 +2451,23 @@ A7B3>AB53 A7B4>A7B5 A7B6>A7B7 A7B8>A7B9 +A7BA>A7BB +A7BC>A7BD +A7BE>A7BF +A7C2>A7C3 +A7C4>A794 +A7C5>0282 +A7C6>1D8E +A7C7>A7C8 +A7C9>A7CA +A7F5>A7F6 A7F8>0127 A7F9>0153 AB5C>A727 AB5D>AB37 AB5E>026B AB5F>AB52 +AB69>028D AB70>13A0 AB71>13A1 AB72>13A2 @@ -5319,6 +5331,7 @@ FFF0..FFF8> 1F14F>0077 0063 1F16A>006D 0063 1F16B>006D 0064 +1F16C>006D 0072 1F190>0064 006A 1F200>307B 304B 1F201>30B3 30B3 @@ -5378,6 +5391,16 @@ FFF0..FFF8> 1F248>3014 6557 3015 1F250>5F97 1F251>53EF +1FBF0>0030 +1FBF1>0031 +1FBF2>0032 +1FBF3>0033 +1FBF4>0034 +1FBF5>0035 +1FBF6>0036 +1FBF7>0037 +1FBF8>0038 +1FBF9>0039 2F800>4E3D 2F801>4E38 2F802>4E41 diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk index 9333a40bec9..e10d24e07f1 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk index 7a138834e91..0e9dd26a897 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm index 4c31f905077..92a6919e37f 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ diff --git a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java index ee5ea85d33b..9bf46a0aae7 100644 --- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java +++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java @@ -42,7 +42,7 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; /** - * Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt + * Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt for the specified icu release tag. * *
ASSUMPTION: This class will be run with current directory set to
* lucene/analysis/icu/src/data/utr30/
@@ -56,7 +56,6 @@ import java.util.stream.Collectors;
*/
public class GenerateUTR30DataFiles {
private static final String ICU_GIT_TAG_URL = "https://raw.githubusercontent.com/unicode-org/icu";
- private static final String ICU_RELEASE_TAG = "maint/maint-62";
private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
@@ -74,7 +73,11 @@ public class GenerateUTR30DataFiles {
public static void main(String args[]) {
try {
- getNFKCDataFilesFromIcuProject();
+ if (args.length != 1) {
+ throw new IllegalArgumentException(
+ "usage: " + GenerateUTR30DataFiles.class.getName() + "