Revert "LUCENE-8122: Upgrade analysis/icu to ICU 60.2"

This reverts commit 07407a5b53.
2018-02-20 14:40:53 +01:00 · 2018-02-20 14:40:53 +01:00 · cc1efdb4a3
parent 9a7b56b9df
commit cc1efdb4a3
18 changed files with 40 additions and 128 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -186,8 +186,6 @@ Other
 * LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.
  (Kai Chan via Adrien Grand)

-* LUCENE-8122: Upgrade analysis/icu to ICU 60.2. (Robert Muir)
-
 * LUCENE-8106: Add script (reproduceJenkinsFailures.py) to attempt to reproduce
  failing tests from a Jenkins log. (Steve Rowe)

--- a/lucene/analysis/icu/src/data/uax29/Default.rbbi
+++ b/lucene/analysis/icu/src/data/uax29/Default.rbbi
@ -14,21 +14,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# This file is from ICU (with some small modifications, to avoid CJK dictionary break,
-# and status code change related to that)
+# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
 #
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (C) 2002-2016, International Business Machines Corporation
+# Copyright (C) 2002-2013, International Business Machines Corporation 
 # and others. All Rights Reserved.
 #
 # file:  word.txt
 #
 # ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.
@ -40,7 +35,6 @@
 ##############################################################################

 !!chain;
-!!quoted_literals_only;


 #
@ -49,9 +43,8 @@

 $CR                 = [\p{Word_Break = CR}];
 $LF                 = [\p{Word_Break = LF}];
-$Newline            = [\p{Word_Break = Newline} ];
+$Newline            = [\p{Word_Break = Newline}];
 $Extend             = [\p{Word_Break = Extend}];
-$ZWJ                = [\p{Word_Break = ZWJ}];
 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
 $Format             = [\p{Word_Break = Format}];
 $Katakana           = [\p{Word_Break = Katakana}];
@ -64,13 +57,6 @@ $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
-$E_Base             = [\p{Word_Break = EB}];
-$E_Modifier         = [\p{Word_Break = EM}];
-
-# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
-$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
-$EBG                = [\p{Word_Break = EBG}];
-$EmojiNRK           = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];

 $Han                = [:Han:];
 $Hiragana           = [:Hiragana:];
@ -97,21 +83,21 @@ $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
 #             except when they appear at the beginning of a region of text.
 #
 # TODO: check if handling of katakana in dictionary makes rules incorrect/void
-$KatakanaEx           = $Katakana           ($Extend |  $Format | $ZWJ)*;
-$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format | $ZWJ)*;
-$ALetterEx            = $ALetterPlus        ($Extend |  $Format | $ZWJ)*;
-$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format | $ZWJ)*;
-$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format | $ZWJ)*;
-$MidNumLetEx          = $MidNumLet          ($Extend |  $Format | $ZWJ)*;
-$MidLetterEx          = $MidLetter          ($Extend |  $Format | $ZWJ)*;
-$MidNumEx             = $MidNum             ($Extend |  $Format | $ZWJ)*;
-$NumericEx            = $Numeric            ($Extend |  $Format | $ZWJ)*;
-$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format | $ZWJ)*;
-$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format | $ZWJ)*;
+$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
+$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
+$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;
+$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
+$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
+$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
+$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
+$MidNumEx             = $MidNum             ($Extend |  $Format)*;
+$NumericEx            = $Numeric            ($Extend |  $Format)*;
+$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;

 $Ideographic    = [\p{Ideographic}];
-$HiraganaEx     = $Hiragana     ($Extend |  $Format | $ZWJ)*;
-$IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
+$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
+$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;

 ## -------------------------------------------------

@ -122,17 +108,12 @@ $IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
 #
 $CR $LF;

-# Rule 3c   ZWJ x (Extended_Pict | EmojiNRK).  Precedes WB4, so no intervening Extend chars allowed.
-#
-$ZWJ ($Extended_Pict | $EmojiNRK);
-
-
 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
 #          of a region of Text.   The rule here comes into play when the start of text
 #          begins with a group of Format chars, or with a "word" consisting of a single
 #          char that is not in any of the listed word break categories followed by
 #          format char(s), or is not a CJK dictionary character.
-[^$CR $LF $Newline]? ($Extend |  $Format | $ZWJ)+;
+[^$CR $LF $Newline]? ($Extend |  $Format)+;

 $NumericEx {100};
 $ALetterEx {200};
@ -142,10 +123,6 @@ $KatakanaEx {300};       # note:  these status values override those from rule 5
 $HiraganaEx {300};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #

-$E_Base ($Extend | $Format | $ZWJ)*;
-$E_Modifier ($Extend | $Format | $ZWJ)*;
-$Extended_Pict ($Extend | $Format | $ZWJ)*;
-
 #
 # rule 5
 #    Do not break between most letters.
@ -193,42 +170,9 @@ $ExtendNumLetEx  $Hebrew_Letter  {200};    #  (13b)
 $ExtendNumLetEx  $NumericEx      {100};    #  (13b)
 $ExtendNumLetEx  $KatakanaEx     {300};    #  (13b)

-# rule 14
-#    Do not break within emoji modifier sequences
+# rule 13c

-($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
-
-# rules 15 - 17
-#    Pairs of Regional Indicators stay together.
-#    With rule chaining disabled by ^, this rule will match exactly two of them.
-#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
-#
-^$Regional_IndicatorEx $Regional_IndicatorEx;
+$Regional_IndicatorEx $Regional_IndicatorEx;

 # special handling for CJK characters: chain for later dictionary segmentation
 $HangulSyllable $HangulSyllable {200};
-
-# Rule 999
-#     Match a single code point if no other rule applies.
-.;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
--- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
@ -73,14 +73,12 @@
 0A4D>
 0ABC>
 0ACD>
-0AFD..0AFF>
 0B3C>
 0B4D>
 0BCD>
 0C4D>
 0CBC>
 0CCD>
-0D3B..0D3C>
 0D4D>
 0DCA>
 0E47..0E4C>
@ -114,10 +112,10 @@
 1CD0..1CE8>
 1CED>
 1CF4>
-1CF7..1CF9>
+1CF8..1CF9>
 1D2C..1D6A>
 1DC4..1DCF>
-1DF5..1DF9>
+1DF5>
 1DFD..1DFF>
 1FBD>
 1FBF..1FC1>
@ -177,12 +175,7 @@ FFE3>
 1163F>
 116B6..116B7>
 1172B>
-11A34>
-11A47>
-11A99>
 11C3F>
-11D42>
-11D44..11D45>
 16AF0..16AF4>
 16F8F..16F9F>
 1D167..1D169>
--- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
@ -580,16 +580,6 @@ ABF9>0039   # MEETEI MAYEK DIGIT NINE
 11C57>0037   # BHAIKSUKI DIGIT SEVEN
 11C58>0038   # BHAIKSUKI DIGIT EIGHT
 11C59>0039   # BHAIKSUKI DIGIT NINE
-11D50>0030   # MASARAM GONDI DIGIT ZERO
-11D51>0031   # MASARAM GONDI DIGIT ONE
-11D52>0032   # MASARAM GONDI DIGIT TWO
-11D53>0033   # MASARAM GONDI DIGIT THREE
-11D54>0034   # MASARAM GONDI DIGIT FOUR
-11D55>0035   # MASARAM GONDI DIGIT FIVE
-11D56>0036   # MASARAM GONDI DIGIT SIX
-11D57>0037   # MASARAM GONDI DIGIT SEVEN
-11D58>0038   # MASARAM GONDI DIGIT EIGHT
-11D59>0039   # MASARAM GONDI DIGIT NINE
 16A60>0030   # MRO DIGIT ZERO
 16A61>0031   # MRO DIGIT ONE
 16A62>0032   # MRO DIGIT TWO
--- a/lucene/analysis/icu/src/data/utr30/nfc.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfc.txt
@ -1,5 +1,3 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
 # Copyright (C) 1999-2016, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@ -9,7 +7,7 @@
 #
 # Complete data for Unicode NFC normalization.

-* Unicode 10.0.0
+* Unicode 9.0.0

 # Canonical_Combining_Class (ccc) values
 0300..0314:230
@ -166,7 +164,6 @@
 0C56:91
 0CBC:7
 0CCD:9
-0D3B..0D3C:9
 0D4D:9
 0DCA:9
 0E38..0E39:103
@ -237,9 +234,6 @@
 1DCF:220
 1DD0:202
 1DD1..1DF5:230
-1DF6:232
-1DF7..1DF8:228
-1DF9:220
 1DFB:230
 1DFC:233
 1DFD:220
@ -328,12 +322,7 @@ FE2E..FE2F:230
 116B6:9
 116B7:7
 1172B:9
-11A34:9
-11A47:9
-11A99:9
 11C3F:9
-11D42:7
-11D44..11D45:9
 16AF0..16AF4:1
 16B30..16B36:230
 1BC9E:1
--- a/lucene/analysis/icu/src/data/utr30/nfkc.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfkc.txt
@ -1,5 +1,3 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
 # Copyright (C) 1999-2016, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
@ -13,7 +11,7 @@
 # to NFKC one-way mappings.
 # Use this file as the second gennorm2 input file after nfc.txt.

-* Unicode 10.0.0
+* Unicode 9.0.0

 00A0>0020
 00A8>0020 0308
--- a/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
+++ b/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
@ -1,7 +1,7 @@
-# Copyright (C) 2016 and later: Unicode, Inc. and others.
-# License & terms of use: http://www.unicode.org/copyright.html
-# Copyright (C) 1999-2016, International Business Machines
-# Corporation and others.  All Rights Reserved.
+# Unicode Character Database
+# Copyright (c) 1991-2016 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+# For documentation, see http://www.unicode.org/reports/tr44/
 #
 # file name: nfkc_cf.txt
 #
@ -12,7 +12,7 @@
 # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
 # Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

-* Unicode 10.0.0
+* Unicode 9.0.0

 0041>0061
 0042>0062
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java
@ -200,18 +200,18 @@ public final class ICUTokenizer extends Tokenizer {
   */
  private boolean incrementTokenBuffer() {
    int start = breaker.current();
-    assert start != BreakIterator.DONE;
+    if (start == BreakIterator.DONE)
+      return false; // BreakIterator exhausted

    // find the next set of boundaries, skipping over non-tokens (rule status 0)
    int end = breaker.next();
-    while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
+    while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
      start = end;
      end = breaker.next();
    }

-    if (end == BreakIterator.DONE) {
+    if (start == BreakIterator.DONE)
      return false; // BreakIterator exhausted
-    }

    termAtt.copyBuffer(buffer, start, end - start);
    offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@ -353,7 +353,7 @@ and
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 10.0). However, some users who wish
+the most recent version of Unicode (currently 8.0). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. 
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
--- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
+++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java
@ -62,9 +62,9 @@ import java.util.regex.Pattern;
 */
 public class GenerateUTR30DataFiles {
  private static final String ICU_SVN_TAG_URL
-      = "http://source.icu-project.org/repos/icu/tags";
-  private static final String ICU_RELEASE_TAG = "release-60-2";
-  private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
+      = "http://source.icu-project.org/repos/icu/icu/tags";
+  private static final String ICU_RELEASE_TAG = "release-58-1";
+  private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
  private static final String NFC_TXT = "nfc.txt";
  private static final String NFKC_TXT = "nfkc.txt";
  private static final String NFKC_CF_TXT = "nfkc_cf.txt";
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@ -31,7 +31,7 @@ com.fasterxml.jackson.core.version = 2.5.4
 /com.googlecode.mp4parser/isoparser = 1.1.18
 /com.healthmarketscience.jackcess/jackcess = 2.1.8
 /com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4
-/com.ibm.icu/icu4j = 60.2
+/com.ibm.icu/icu4j = 59.1
 /com.pff/java-libpst = 0.8.1

 com.rometools.version = 1.5.1
--- a/lucene/licenses/icu4j-59.1.jar.sha1
+++ b/lucene/licenses/icu4j-59.1.jar.sha1
@ -0,0 +1 @@
+6f06e820cf4c8968bbbaae66ae0b33f6a256b57f
--- a/lucene/licenses/icu4j-60.2.jar.sha1
+++ b/lucene/licenses/icu4j-60.2.jar.sha1
@ -1 +0,0 @@
-e452cba3caaf93b997ff543c7246a6da74ed70f1
--- a/solr/licenses/icu4j-59.1.jar.sha1
+++ b/solr/licenses/icu4j-59.1.jar.sha1
@ -0,0 +1 @@
+6f06e820cf4c8968bbbaae66ae0b33f6a256b57f
--- a/solr/licenses/icu4j-60.2.jar.sha1
+++ b/solr/licenses/icu4j-60.2.jar.sha1
@ -1 +0,0 @@
-e452cba3caaf93b997ff543c7246a6da74ed70f1