LUCENE-8366: Upgrade to ICU 62.1

2018-06-21 20:08:17 -04:00 · 2018-06-21 20:08:17 -04:00 · 2ea416ee3d
parent 5cd8aa4ff9
commit 2ea416ee3d
17 changed files with 194 additions and 108 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -123,6 +123,10 @@ Changes in Runtime Behavior:
 * LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
  findForcedMerges and findForcedDeletesMerges (Erick Erickson)

+Other:
+
+* LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's
+  Extended_Pictographic property. (Robert Muir)

 ======================= Lucene 7.4.1 =======================

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
@ -24,15 +24,15 @@ import org.apache.lucene.util.SparseFixedBitSet;

 /**
 * This file contains unicode properties used by various {@link CharTokenizer}s.
- * The data was created using ICU4J v61.1.0.0
+ * The data was created using ICU4J v62.1.0.0
 * <p>
- * Unicode version: 10.0.0.0
+ * Unicode version: 11.0.0.0
 */
 public final class UnicodeProps {
  private UnicodeProps() {}
  
  /** Unicode version that was used to generate this file: {@value} */
-  public static final String UNICODE_VERSION = "10.0.0.0";
+  public static final String UNICODE_VERSION = "11.0.0.0";
  
  /** Bitset with Unicode WHITESPACE code points. */
  public static final Bits WHITESPACE = createBits(
--- a/lucene/analysis/icu/src/data/uax29/Default.rbbi
+++ b/lucene/analysis/icu/src/data/uax29/Default.rbbi
@ -63,14 +63,10 @@ $MidNumLet          = [\p{Word_Break = MidNumLet}];
 $MidLetter          = [\p{Word_Break = MidLetter}];
 $MidNum             = [\p{Word_Break = MidNum}];
 $Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
-$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
-$E_Base             = [\p{Word_Break = EB}];
-$E_Modifier         = [\p{Word_Break = EM}];

-# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
-$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F
932\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
-$EBG                = [\p{Word_Break = EBG}];
-$EmojiNRK           = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [:ExtPict:];

 $Han                = [:Han:];
 $Hiragana           = [:Hiragana:];
@ -115,17 +111,17 @@ $IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;

 ## -------------------------------------------------

-!!forward;
-
-
 # Rule 3 - CR x LF
 #
 $CR $LF;

 # Rule 3c   ZWJ x (Extended_Pict | EmojiNRK).  Precedes WB4, so no intervening Extend chars allowed.
 #
-$ZWJ ($Extended_Pict | $EmojiNRK);
+$ZWJ $Extended_Pict;

+# Rule 3d - Keep horizontal whitespace together.
+#
+$WSegSpace $WSegSpace;

 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
 #          of a region of Text.   The rule here comes into play when the start of text
@ -142,8 +138,6 @@ $KatakanaEx {300};       # note:  these status values override those from rule 5
 $HiraganaEx {300};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #

-$E_Base ($Extend | $Format | $ZWJ)*;
-$E_Modifier ($Extend | $Format | $ZWJ)*;
 $Extended_Pict ($Extend | $Format | $ZWJ)*;

 #
@ -193,11 +187,6 @@ $ExtendNumLetEx  $Hebrew_Letter  {200};    #  (13b)
 $ExtendNumLetEx  $NumericEx      {100};    #  (13b)
 $ExtendNumLetEx  $KatakanaEx     {300};    #  (13b)

-# rule 14
-#    Do not break within emoji modifier sequences
-
-($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
-
 # rules 15 - 17
 #    Pairs of Regional Indicators stay together.
 #    With rule chaining disabled by ^, this rule will match exactly two of them.
@ -211,24 +200,3 @@ $HangulSyllable $HangulSyllable {200};
 # Rule 999
 #     Match a single code point if no other rule applies.
 .;
-
-
-## -------------------------------------------------
-
-!!safe_reverse;
-
-# rule 3
-($Extend | $Format | $ZWJ)+ .?;
-
-# rule 6
-($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
-
-# rule 7b
-$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
-
-
-# rule 11
-($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
-
-# rule 13c
-$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
--- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
@ -159,6 +159,8 @@ FF9E..FF9F>
 FFE3>
 102E0>
 10AE5..10AE6>
+10D22..10D27>
+10F46..10F50>
 110B9..110BA>
 11133..11134>
 11173>
@ -177,12 +179,14 @@ FFE3>
 1163F>
 116B6..116B7>
 1172B>
+11839..1183A>
 11A34>
 11A47>
 11A99>
 11C3F>
 11D42>
 11D44..11D45>
+11D97>
 16AF0..16AF4>
 16F8F..16F9F>
 1D167..1D169>
--- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
@ -442,6 +442,16 @@ ABF9>0039   # MEETEI MAYEK DIGIT NINE
 10A41>0032   # KHAROSHTHI DIGIT TWO
 10A42>0033   # KHAROSHTHI DIGIT THREE
 10A43>0034   # KHAROSHTHI DIGIT FOUR
+10D30>0030   # HANIFI ROHINGYA DIGIT ZERO
+10D31>0031   # HANIFI ROHINGYA DIGIT ONE
+10D32>0032   # HANIFI ROHINGYA DIGIT TWO
+10D33>0033   # HANIFI ROHINGYA DIGIT THREE
+10D34>0034   # HANIFI ROHINGYA DIGIT FOUR
+10D35>0035   # HANIFI ROHINGYA DIGIT FIVE
+10D36>0036   # HANIFI ROHINGYA DIGIT SIX
+10D37>0037   # HANIFI ROHINGYA DIGIT SEVEN
+10D38>0038   # HANIFI ROHINGYA DIGIT EIGHT
+10D39>0039   # HANIFI ROHINGYA DIGIT NINE
 10E60>0031   # RUMI DIGIT ONE
 10E61>0032   # RUMI DIGIT TWO
 10E62>0033   # RUMI DIGIT THREE
@ -590,6 +600,16 @@ ABF9>0039   # MEETEI MAYEK DIGIT NINE
 11D57>0037   # MASARAM GONDI DIGIT SEVEN
 11D58>0038   # MASARAM GONDI DIGIT EIGHT
 11D59>0039   # MASARAM GONDI DIGIT NINE
+11DA0>0030   # GUNJALA GONDI DIGIT ZERO
+11DA1>0031   # GUNJALA GONDI DIGIT ONE
+11DA2>0032   # GUNJALA GONDI DIGIT TWO
+11DA3>0033   # GUNJALA GONDI DIGIT THREE
+11DA4>0034   # GUNJALA GONDI DIGIT FOUR
+11DA5>0035   # GUNJALA GONDI DIGIT FIVE
+11DA6>0036   # GUNJALA GONDI DIGIT SIX
+11DA7>0037   # GUNJALA GONDI DIGIT SEVEN
+11DA8>0038   # GUNJALA GONDI DIGIT EIGHT
+11DA9>0039   # GUNJALA GONDI DIGIT NINE
 16A60>0030   # MRO DIGIT ZERO
 16A61>0031   # MRO DIGIT ONE
 16A62>0032   # MRO DIGIT TWO
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java
@ -16,8 +16,6 @@
 */
 package org.apache.lucene.analysis.icu.segmentation;

-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
@ -65,18 +63,18 @@ final class BreakIteratorWrapper {
    }
  }
  
-  // See unicode doc L2/16-315 and also the RBBI rules for rationale.
-  // we don't include regional indicators here, because they aren't ambiguous for tagging,
-  // they need only be treated special for segmentation.
+  // See unicode doc L2/16-315 for rationale.
+  // For our purposes, these are the characters whose token type is ambiguous (keycap bases, etc.).
  static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
+  // faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
+  static final UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").freeze();

  /** Returns true if the current text represents emoji character or sequence */
  private boolean isEmoji(int current, int next) {
    int begin = start + current;
    int end = start + next;
    int codepoint = UTF16.charAt(text, 0, end, begin);
-    // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
-    if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
+    if (EMOJI.contains(codepoint)) {
      if (EMOJI_RK.contains(codepoint)) {
        // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
        // an emoji presentation selector or keycap follows.
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@ -353,7 +353,7 @@ and
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 10.0). However, some users who wish
+the most recent version of Unicode (currently 11.0). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. 
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi
@ -14,35 +14,71 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
-# Added dashes to $MidLetter, so that words aren't broken on single dashes.
+# This test is a modified Default.rbbi that adds hyphens to MidLetter
 #
+# Default.rbbi is from ICU (with some small modifications, to avoid CJK dictionary break,
+# and status code change related to that)
+#
+# Copyright (C) 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Copyright (C) 2002-2016, International Business Machines Corporation
+# and others. All Rights Reserved.
+#
+# file:  word.txt
+#
+# ICU Word Break Rules
+#      See Unicode Standard Annex #29.
+#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
+#      with additions for Emoji Sequences from https://goo.gl/cluFCn
+#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
+
+##############################################################################
+#
+#  Character class definitions from TR 29
+#
+##############################################################################

 !!chain;
+!!quoted_literals_only;
+

 #
 #  Character Class Definitions.
 #

-$CR           = [\p{Word_Break = CR}];
-$LF           = [\p{Word_Break = LF}];
-$Newline      = [\p{Word_Break = Newline}];
-$Extend       = [\p{Word_Break = Extend}];
-$Format       = [\p{Word_Break = Format}];
-$Katakana     = [\p{Word_Break = Katakana}];
-$ALetter      = [\p{Word_Break = ALetter}];
-$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline} ];
+$Extend             = [\p{Word_Break = Extend}];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
 # Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
-$Dash         = [\N{HYPHEN-MINUS}
-                 \N{HYPHEN}
-                 \N{EN DASH}
-                 \N{MINUS SIGN}
-                 \N{SMALL HYPHEN-MINUS}
-                 \N{FULLWIDTH HYPHEN-MINUS}];
-$MidLetter    = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
-$MidNum       = [\p{Word_Break = MidNum}];
-$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
-$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$Dash               = [\N{HYPHEN-MINUS}
+                       \N{HYPHEN}
+                       \N{EN DASH}
+                       \N{MINUS SIGN}
+                       \N{SMALL HYPHEN-MINUS}
+                       \N{FULLWIDTH HYPHEN-MINUS}];
+$MidLetter          = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [:ExtPict:];
+
+$Han                = [:Han:];
+$Hiragana           = [:Hiragana:];


 #   Dictionary character set, for triggering language-based break engines. Currently
@ -50,58 +86,83 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
 #   5.0 or later as the definition of Complex_Context was corrected to include all
 #   characters requiring dictionary break.

-$dictionary   = [:LineBreak = Complex_Context:];
-$Control        = [\p{Grapheme_Cluster_Break = Control}];
-$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
-                                                             #  include the dictionary characters.
+$Control        = [\p{Grapheme_Cluster_Break = Control}]; 
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$Han $Hiragana $HangulSyllable];
+$dictionary     = [$ComplexContext];
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+

 #
-#  Rules 4    Ignore Format and Extend characters,
+#  Rules 4    Ignore Format and Extend characters, 
 #             except when they appear at the beginning of a region of text.
 #
-$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
-$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
-$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
-$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
-$MidNumEx       = $MidNum       ($Extend |  $Format)*;
-$NumericEx      = $Numeric      ($Extend |  $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+$KatakanaEx           = $Katakana           ($Extend |  $Format | $ZWJ)*;
+$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format | $ZWJ)*;
+$ALetterEx            = $ALetterPlus        ($Extend |  $Format | $ZWJ)*;
+$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format | $ZWJ)*;
+$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format | $ZWJ)*;
+$MidNumLetEx          = $MidNumLet          ($Extend |  $Format | $ZWJ)*;
+$MidLetterEx          = $MidLetter          ($Extend |  $Format | $ZWJ)*;
+$MidNumEx             = $MidNum             ($Extend |  $Format | $ZWJ)*;
+$NumericEx            = $Numeric            ($Extend |  $Format | $ZWJ)*;
+$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format | $ZWJ)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format | $ZWJ)*;

-$Hiragana       = [\p{script=Hiragana}];
 $Ideographic    = [\p{Ideographic}];
-$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
-$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
+$HiraganaEx     = $Hiragana     ($Extend |  $Format | $ZWJ)*;
+$IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;

 ## -------------------------------------------------

-!!forward;
-
-
 # Rule 3 - CR x LF
 #
 $CR $LF;

+# Rule 3c   ZWJ x Extended_Pict.  Precedes WB4, so no intervening Extend chars allowed.
+#
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
+#
+$WSegSpace $WSegSpace;
+
 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
 #          of a region of Text.   The rule here comes into play when the start of text
 #          begins with a group of Format chars, or with a "word" consisting of a single
 #          char that is not in any of the listed word break categories followed by
-#          format char(s).
-[^$CR $LF $Newline]? ($Extend |  $Format)+;
+#          format char(s), or is not a CJK dictionary character.
+[^$CR $LF $Newline]? ($Extend |  $Format | $ZWJ)+;

 $NumericEx {100};
 $ALetterEx {200};
+$HangulSyllable {200};
+$Hebrew_LetterEx{200};
 $KatakanaEx {300};       # note:  these status values override those from rule 5
-$HiraganaEx {300};       #        by virtual of being numerically larger.
+$HiraganaEx {300};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #

+$Extended_Pict ($Extend | $Format | $ZWJ)*;
+
 #
 # rule 5
 #    Do not break between most letters.
 #
-$ALetterEx $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};

 # rule 6 and 7
-$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
+
+# rule 7a
+$Hebrew_LetterEx $Single_QuoteEx {200};
+
+# rule 7b and 7c
+$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};

 # rule 8

@ -109,27 +170,43 @@ $NumericEx $NumericEx {100};

 # rule 9

-$ALetterEx $NumericEx {200};
+($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};

 # rule 10

-$NumericEx $ALetterEx {200};
+$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};

-# rule 11 and 12
+# rule 11 and 12 

-$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};

 # rule 13
-
 $KatakanaEx  $KatakanaEx {300};

 # rule 13a/b

-$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
-$NumericEx      $ExtendNumLetEx {100};    #  (13a)
-$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
-$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
+$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
+$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
+$NumericEx       $ExtendNumLetEx {100};    #  (13a)
+$KatakanaEx      $ExtendNumLetEx {300};    #  (13a)
+$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)
+
+$ExtendNumLetEx  $ALetterEx      {200};    #  (13b)
+$ExtendNumLetEx  $Hebrew_Letter  {200};    #  (13b)
+$ExtendNumLetEx  $NumericEx      {100};    #  (13b)
+$ExtendNumLetEx  $KatakanaEx     {300};    #  (13b)
+
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With rule chaining disabled by ^, this rule will match exactly two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
+#
+^$Regional_IndicatorEx $Regional_IndicatorEx;
+
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;

-$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
-$ExtendNumLetEx $NumericEx  {100};    #  (13b)
-$ExtendNumLetEx $KatakanaEx {300};    #  (13b)
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;

 import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UnicodeSet;

 import java.io.IOException;
 import java.io.StringReader;
@ -316,6 +317,20 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
  }
  
+  public void testEmojiFromTheFuture() throws Exception {
+    // pick an unassigned code point that has the Extended_Pictographic property
+    int ch = new UnicodeSet("[[:Extended_Pictographic:]&[:Unassigned:]]").getRangeStart(0);
+    String value = new String(Character.toChars(ch));
+    // should analyze to emoji type
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, value,
+        new String[] { value },
+        new String[] { "<EMOJI>" });
+    // shouldn't break in a sequence
+    BaseTokenStreamTestCase.assertAnalyzesTo(a, value + '\u200D' + value,
+        new String[] { value + '\u200D' + value  },
+        new String[] { "<EMOJI>" });
+  }
+
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@ -30,7 +30,7 @@ com.fasterxml.jackson.core.version = 2.9.5
 /com.googlecode.mp4parser/isoparser = 1.1.18
 /com.healthmarketscience.jackcess/jackcess = 2.1.8
 /com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4
-/com.ibm.icu/icu4j = 61.1
+/com.ibm.icu/icu4j = 62.1
 /com.lmax/disruptor = 3.4.0
 /com.pff/java-libpst = 0.8.1

--- a/lucene/licenses/icu4j-61.1.jar.sha1
+++ b/lucene/licenses/icu4j-61.1.jar.sha1
@ -1 +0,0 @@
-28d33b5e44e72edcc66a5da7a34a42147f38d987
--- a/lucene/licenses/icu4j-62.1.jar.sha1
+++ b/lucene/licenses/icu4j-62.1.jar.sha1
@ -0,0 +1 @@
+7a4d00d5ec5febd252a6182e8b6e87a0a9821f81
--- a/solr/licenses/icu4j-61.1.jar.sha1
+++ b/solr/licenses/icu4j-61.1.jar.sha1
@ -1 +0,0 @@
-28d33b5e44e72edcc66a5da7a34a42147f38d987
--- a/solr/licenses/icu4j-62.1.jar.sha1
+++ b/solr/licenses/icu4j-62.1.jar.sha1
@ -0,0 +1 @@
+7a4d00d5ec5febd252a6182e8b6e87a0a9821f81