diff --git a/gradle/generation/icu.gradle b/gradle/generation/icu.gradle index a68b0e189e5..039db07c3ea 100644 --- a/gradle/generation/icu.gradle +++ b/gradle/generation/icu.gradle @@ -51,12 +51,16 @@ configure(project(":lucene:analysis:icu")) { doFirst { // all these steps must be done sequentially: it's a pipeline resulting in utr30.nrm + def v = getVersion('com.ibm.icu', 'icu4j'); project.javaexec { main = "org.apache.lucene.analysis.icu.GenerateUTR30DataFiles" classpath = sourceSets.tools.runtimeClasspath ignoreExitValue false workingDir utr30DataDir + args = [ + "release-${v.replace(".", "-")}" + ] } project.exec { diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index dfb6946d66d..40d61c93848 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -256,6 +256,8 @@ Other * LUCENE-9627: Remove unused Lucene50FieldInfosFormat codec and small refactor some codecs to separate reading header/footer from reading content of the file. (Ignacio Vera) +* LUCENE-9773: Upgrade icu to 68.2 (Robert Muir) + ======================= Lucene 8.9.0 ======================= API Changes diff --git a/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt b/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt index 1246a87bcea..3a6fe718f77 100644 --- a/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt +++ b/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt @@ -56,6 +56,7 @@ FE58>002D FE63>002D FF0D>002D +10EAD>002D ## Greek letterforms folding (done by kd) diff --git a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt index ac6ac1eb05b..a6c7c4b352b 100644 --- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt +++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt @@ -76,6 +76,7 @@ 0AFD..0AFF> 0B3C> 0B4D> +0B55> 0BCD> 0C4D> 0CBC> @@ -85,6 +86,7 @@ 0DCA> 0E47..0E4C> 0E4E> +0EBA> 0EC8..0ECC> 0F18..0F19> 0F35> @@ -96,9 +98,12 @@ 0FC6> 1037> 1039..103A> +1063..1064> +1069..106D> 1087..108D> 108F> 109A..109B> +135D..135F> 17C9..17D3> 17DD> 1939..193B> @@ -135,8 +140,8 @@ A67C..A67D> A67F> A69C..A69D> A6F0..A6F1> -A717..A721> -A788> +A700..A721> +A788..A78A> A7F8..A7F9> A8C4> A8E0..A8F1> @@ -149,6 +154,7 @@ AA7B..AA7D> AABF..AAC2> AAF6> AB5B..AB5F> +AB69..AB6B> ABEC..ABED> FB1E> FE20..FE2F> @@ -180,6 +186,9 @@ FFE3> 116B6..116B7> 1172B> 11839..1183A> +1193D..1193E> +11943> +119E0> 11A34> 11A47> 11A99> @@ -188,12 +197,16 @@ FFE3> 11D44..11D45> 11D97> 16AF0..16AF4> +16B30..16B36> 16F8F..16F9F> +16FF0..16FF1> 1D167..1D169> 1D16D..1D172> 1D17B..1D182> 1D185..1D18B> 1D1AA..1D1AD> +1E130..1E136> +1E2EC..1E2EF> 1E8D0..1E8D6> 1E944..1E946> 1E948..1E94A> diff --git a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt index 382a2795601..c3c28fc9b20 100644 --- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt +++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt @@ -580,6 +580,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE 118E7>0037 # WARANG CITI DIGIT SEVEN 118E8>0038 # WARANG CITI DIGIT EIGHT 118E9>0039 # WARANG CITI DIGIT NINE +11950>0030 # DIVES AKURU DIGIT ZERO +11951>0031 # DIVES AKURU DIGIT ONE +11952>0032 # DIVES AKURU DIGIT TWO +11953>0033 # DIVES AKURU DIGIT THREE +11954>0034 # DIVES AKURU DIGIT FOUR +11955>0035 # DIVES AKURU DIGIT FIVE +11956>0036 # DIVES AKURU DIGIT SIX +11957>0037 # DIVES AKURU DIGIT SEVEN +11958>0038 # DIVES AKURU DIGIT EIGHT +11959>0039 # DIVES AKURU DIGIT NINE 11C50>0030 # BHAIKSUKI DIGIT ZERO 11C51>0031 # BHAIKSUKI DIGIT ONE 11C52>0032 # BHAIKSUKI DIGIT TWO @@ -630,6 +640,26 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE 16B57>0037 # PAHAWH HMONG DIGIT SEVEN 16B58>0038 # PAHAWH HMONG DIGIT EIGHT 16B59>0039 # PAHAWH HMONG DIGIT NINE +1E140>0030 # NYIAKENG PUACHUE HMONG DIGIT ZERO +1E141>0031 # NYIAKENG PUACHUE HMONG DIGIT ONE +1E142>0032 # NYIAKENG PUACHUE HMONG DIGIT TWO +1E143>0033 # NYIAKENG PUACHUE HMONG DIGIT THREE +1E144>0034 # NYIAKENG PUACHUE HMONG DIGIT FOUR +1E145>0035 # NYIAKENG PUACHUE HMONG DIGIT FIVE +1E146>0036 # NYIAKENG PUACHUE HMONG DIGIT SIX +1E147>0037 # NYIAKENG PUACHUE HMONG DIGIT SEVEN +1E148>0038 # NYIAKENG PUACHUE HMONG DIGIT EIGHT +1E149>0039 # NYIAKENG PUACHUE HMONG DIGIT NINE +1E2F0>0030 # WANCHO DIGIT ZERO +1E2F1>0031 # WANCHO DIGIT ONE +1E2F2>0032 # WANCHO DIGIT TWO +1E2F3>0033 # WANCHO DIGIT THREE +1E2F4>0034 # WANCHO DIGIT FOUR +1E2F5>0035 # WANCHO DIGIT FIVE +1E2F6>0036 # WANCHO DIGIT SIX +1E2F7>0037 # WANCHO DIGIT SEVEN +1E2F8>0038 # WANCHO DIGIT EIGHT +1E2F9>0039 # WANCHO DIGIT NINE 1E950>0030 # ADLAM DIGIT ZERO 1E951>0031 # ADLAM DIGIT ONE 1E952>0032 # ADLAM DIGIT TWO diff --git a/lucene/analysis/icu/src/data/utr30/nfc.txt b/lucene/analysis/icu/src/data/utr30/nfc.txt index c143c1c5601..90fb17593c6 100644 --- a/lucene/analysis/icu/src/data/utr30/nfc.txt +++ b/lucene/analysis/icu/src/data/utr30/nfc.txt @@ -9,7 +9,7 @@ # # Complete data for Unicode NFC normalization. -* Unicode 11.0.0 +* Unicode 13.0.0 # Canonical_Combining_Class (ccc) values 0300..0314:230 @@ -176,6 +176,7 @@ 0E3A:9 0E48..0E4B:107 0EB8..0EB9:118 +0EBA:9 0EC8..0ECB:122 0F18..0F19:220 0F35:220 @@ -211,6 +212,7 @@ 1AB5..1ABA:220 1ABB..1ABC:230 1ABD:220 +1ABF..1AC0:220 1B34:7 1B44:9 1B6B:230 @@ -275,6 +277,7 @@ A674..A67D:230 A69E..A69F:230 A6F0..A6F1:230 A806:9 +A82C:9 A8C4:9 A8E0..A8F1:230 A92B..A92D:220 @@ -305,6 +308,7 @@ FE2E..FE2F:230 10AE5:230 10AE6:220 10D24..10D27:230 +10EAB..10EAC:230 10F46..10F47:220 10F48..10F4A:230 10F4B:220 @@ -340,6 +344,9 @@ FE2E..FE2F:230 1172B:9 11839:9 1183A:7 +1193D..1193E:9 +11943:7 +119E0:9 11A34:9 11A47:9 11A99:9 @@ -349,6 +356,7 @@ FE2E..FE2F:230 11D97:9 16AF0..16AF4:1 16B30..16B36:230 +16FF0..16FF1:6 1BC9E:1 1D165..1D166:216 1D167..1D169:1 @@ -364,6 +372,8 @@ FE2E..FE2F:230 1E01B..1E021:230 1E023..1E024:230 1E026..1E02A:230 +1E130..1E136:230 +1E2EC..1E2EF:230 1E8D0..1E8D6:220 1E944..1E949:230 1E94A:7 @@ -1874,6 +1884,7 @@ FB4E>05E4 05BF 114BE=114B9 114BD 115BA=115B8 115AF 115BB=115B9 115AF +11938=11935 11930 1D15E>1D157 1D165 1D15F>1D158 1D165 1D160>1D15F 1D16E diff --git a/lucene/analysis/icu/src/data/utr30/nfkc.txt b/lucene/analysis/icu/src/data/utr30/nfkc.txt index 416c6800ec2..675cd107aaa 100644 --- a/lucene/analysis/icu/src/data/utr30/nfkc.txt +++ b/lucene/analysis/icu/src/data/utr30/nfkc.txt @@ -13,7 +13,7 @@ # to NFKC one-way mappings. # Use this file as the second gennorm2 input file after nfc.txt. -* Unicode 11.0.0 +* Unicode 13.0.0 00A0>0020 00A8>0020 0308 @@ -1107,6 +1107,7 @@ 32FC>30F0 32FD>30F1 32FE>30F2 +32FF>4EE4 548C 3300>30A2 30D1 30FC 30C8 3301>30A2 30EB 30D5 30A1 3302>30A2 30F3 30DA 30A2 @@ -1372,6 +1373,7 @@ AB5C>A727 AB5D>AB37 AB5E>026B AB5F>AB52 +AB69>028D FB00>0066 0066 FB01>0066 0069 FB02>0066 006C @@ -3630,6 +3632,7 @@ FFEE>25CB 1F14F>0057 0043 1F16A>004D 0043 1F16B>004D 0044 +1F16C>004D 0052 1F190>0044 004A 1F200>307B 304B 1F201>30B3 30B3 @@ -3689,3 +3692,13 @@ FFEE>25CB 1F248>3014 6557 3015 1F250>5F97 1F251>53EF +1FBF0>0030 +1FBF1>0031 +1FBF2>0032 +1FBF3>0033 +1FBF4>0034 +1FBF5>0035 +1FBF6>0036 +1FBF7>0037 +1FBF8>0038 +1FBF9>0039 diff --git a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt index d37386e0506..21c441f945b 100644 --- a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt +++ b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt @@ -12,7 +12,7 @@ # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool. # Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt. -* Unicode 11.0.0 +* Unicode 13.0.0 0041>0061 0042>0062 @@ -2082,6 +2082,7 @@ 32FC>30F0 32FD>30F1 32FE>30F2 +32FF>4EE4 548C 3300>30A2 30D1 30FC 30C8 3301>30A2 30EB 30D5 30A1 3302>30A2 30F3 30DA 30A2 @@ -2450,12 +2451,23 @@ A7B3>AB53 A7B4>A7B5 A7B6>A7B7 A7B8>A7B9 +A7BA>A7BB +A7BC>A7BD +A7BE>A7BF +A7C2>A7C3 +A7C4>A794 +A7C5>0282 +A7C6>1D8E +A7C7>A7C8 +A7C9>A7CA +A7F5>A7F6 A7F8>0127 A7F9>0153 AB5C>A727 AB5D>AB37 AB5E>026B AB5F>AB52 +AB69>028D AB70>13A0 AB71>13A1 AB72>13A2 @@ -5319,6 +5331,7 @@ FFF0..FFF8> 1F14F>0077 0063 1F16A>006D 0063 1F16B>006D 0064 +1F16C>006D 0072 1F190>0064 006A 1F200>307B 304B 1F201>30B3 30B3 @@ -5378,6 +5391,16 @@ FFF0..FFF8> 1F248>3014 6557 3015 1F250>5F97 1F251>53EF +1FBF0>0030 +1FBF1>0031 +1FBF2>0032 +1FBF3>0033 +1FBF4>0034 +1FBF5>0035 +1FBF6>0036 +1FBF7>0037 +1FBF8>0038 +1FBF9>0039 2F800>4E3D 2F801>4E38 2F802>4E41 diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk index 9333a40bec9..e10d24e07f1 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk index 7a138834e91..0e9dd26a897 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm index 4c31f905077..92a6919e37f 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ diff --git a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java index ee5ea85d33b..9bf46a0aae7 100644 --- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java +++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java @@ -42,7 +42,7 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; /** - * Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt + * Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt for the specified icu release tag. * *

ASSUMPTION: This class will be run with current directory set to * lucene/analysis/icu/src/data/utr30/ @@ -56,7 +56,6 @@ import java.util.stream.Collectors; */ public class GenerateUTR30DataFiles { private static final String ICU_GIT_TAG_URL = "https://raw.githubusercontent.com/unicode-org/icu"; - private static final String ICU_RELEASE_TAG = "maint/maint-62"; private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2"; private static final String NFC_TXT = "nfc.txt"; private static final String NFKC_TXT = "nfkc.txt"; @@ -74,7 +73,11 @@ public class GenerateUTR30DataFiles { public static void main(String args[]) { try { - getNFKCDataFilesFromIcuProject(); + if (args.length != 1) { + throw new IllegalArgumentException( + "usage: " + GenerateUTR30DataFiles.class.getName() + " "); + } + getNFKCDataFilesFromIcuProject(args[0]); expandRulesInUTR30DataFiles(); } catch (Throwable t) { t.printStackTrace(System.err); @@ -151,9 +154,9 @@ public class GenerateUTR30DataFiles { } } - private static void getNFKCDataFilesFromIcuProject() throws IOException { + private static void getNFKCDataFilesFromIcuProject(String releaseTag) throws IOException { URL icuTagsURL = new URL(ICU_GIT_TAG_URL + "/"); - URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/"); + URL icuReleaseTagURL = new URL(icuTagsURL, releaseTag + "/"); URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/"); System.err.print("Downloading " + NFKC_TXT + " ... "); diff --git a/lucene/licenses/icu4j-62.2.jar.sha1 b/lucene/licenses/icu4j-62.2.jar.sha1 deleted file mode 100644 index 9691f210c8a..00000000000 --- a/lucene/licenses/icu4j-62.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -9ad0d915018dcbb394678a920d72f606cd1c7214 diff --git a/lucene/licenses/icu4j-68.2.jar.sha1 b/lucene/licenses/icu4j-68.2.jar.sha1 new file mode 100644 index 00000000000..3b682a5877d --- /dev/null +++ b/lucene/licenses/icu4j-68.2.jar.sha1 @@ -0,0 +1 @@ +76893e6000401ace133a65262254be0ebe556d46 diff --git a/solr/licenses/icu4j-62.2.jar.sha1 b/solr/licenses/icu4j-62.2.jar.sha1 deleted file mode 100644 index 9691f210c8a..00000000000 --- a/solr/licenses/icu4j-62.2.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -9ad0d915018dcbb394678a920d72f606cd1c7214 diff --git a/solr/licenses/icu4j-68.2.jar.sha1 b/solr/licenses/icu4j-68.2.jar.sha1 new file mode 100644 index 00000000000..3b682a5877d --- /dev/null +++ b/solr/licenses/icu4j-68.2.jar.sha1 @@ -0,0 +1 @@ +76893e6000401ace133a65262254be0ebe556d46 diff --git a/versions.lock b/versions.lock index b738b791bba..433f98b09cb 100644 --- a/versions.lock +++ b/versions.lock @@ -23,7 +23,7 @@ com.googlecode.juniversalchardet:juniversalchardet:1.0.3 (1 constraints: 0605f33 com.googlecode.mp4parser:isoparser:1.1.22 (1 constraints: 38052d3b) com.healthmarketscience.jackcess:jackcess:3.0.1 (1 constraints: 0605fb35) com.healthmarketscience.jackcess:jackcess-encrypt:3.0.0 (1 constraints: 0505fa35) -com.ibm.icu:icu4j:62.2 (1 constraints: de040d31) +com.ibm.icu:icu4j:68.2 (1 constraints: e4041f31) com.jayway.jsonpath:json-path:2.4.0 (1 constraints: 08050136) com.lmax:disruptor:3.4.2 (1 constraints: 0b050836) com.pff:java-libpst:0.8.1 (1 constraints: 0b050436) diff --git a/versions.props b/versions.props index ab51fb6f022..01402e8afa7 100644 --- a/versions.props +++ b/versions.props @@ -16,7 +16,7 @@ com.googlecode.juniversalchardet:juniversalchardet=1.0.3 com.googlecode.mp4parser:isoparser=1.1.22 com.healthmarketscience.jackcess:jackcess-encrypt=3.0.0 com.healthmarketscience.jackcess:jackcess=3.0.1 -com.ibm.icu:icu4j=62.2 +com.ibm.icu:icu4j=68.2 com.jayway.jsonpath:json-path=2.4.0 com.lmax:disruptor=3.4.2 com.pff:java-libpst=0.8.1