diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e6125cec330..477f2610752 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -123,6 +123,10 @@ Changes in Runtime Behavior: * LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing findForcedMerges and findForcedDeletesMerges (Erick Erickson) +Other: + +* LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's + Extended_Pictographic property. (Robert Muir) ======================= Lucene 7.4.1 ======================= diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java index 86870d0becf..3e1006546bc 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java @@ -24,15 +24,15 @@ import org.apache.lucene.util.SparseFixedBitSet; /** * This file contains unicode properties used by various {@link CharTokenizer}s. - * The data was created using ICU4J v61.1.0.0 + * The data was created using ICU4J v62.1.0.0 *

- * Unicode version: 10.0.0.0 + * Unicode version: 11.0.0.0 */ public final class UnicodeProps { private UnicodeProps() {} /** Unicode version that was used to generate this file: {@value} */ - public static final String UNICODE_VERSION = "10.0.0.0"; + public static final String UNICODE_VERSION = "11.0.0.0"; /** Bitset with Unicode WHITESPACE code points. */ public static final Bits WHITESPACE = createBits( diff --git a/lucene/analysis/icu/src/data/uax29/Default.rbbi b/lucene/analysis/icu/src/data/uax29/Default.rbbi index afda68f47b5..b87fe7fcbf1 100644 --- a/lucene/analysis/icu/src/data/uax29/Default.rbbi +++ b/lucene/analysis/icu/src/data/uax29/Default.rbbi @@ -63,14 +63,10 @@ $MidNumLet = [\p{Word_Break = MidNumLet}]; $MidLetter = [\p{Word_Break = MidLetter}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; -$E_Base = [\p{Word_Break = EB}]; -$E_Modifier = [\p{Word_Break = EM}]; -# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267 -$Extended_Pict = 
[\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F94C\U000
1F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF]; -$EBG = [\p{Word_Break = EBG}]; -$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [:ExtPict:]; $Han = [:Han:]; $Hiragana = [:Hiragana:]; @@ -115,17 +111,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; ## ------------------------------------------------- -!!forward; - - # Rule 3 - CR x LF # $CR $LF; # Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed. # -$ZWJ ($Extended_Pict | $EmojiNRK); +$ZWJ $Extended_Pict; +# Rule 3d - Keep horizontal whitespace together. +# +$WSegSpace $WSegSpace; # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning # of a region of Text. The rule here comes into play when the start of text @@ -142,8 +138,6 @@ $KatakanaEx {300}; # note: these status values override those from rule 5 $HiraganaEx {300}; # by virtue of being numerically larger. $IdeographicEx {400}; # -$E_Base ($Extend | $Format | $ZWJ)*; -$E_Modifier ($Extend | $Format | $ZWJ)*; $Extended_Pict ($Extend | $Format | $ZWJ)*; # @@ -193,11 +187,6 @@ $ExtendNumLetEx $Hebrew_Letter {200}; # (13b) $ExtendNumLetEx $NumericEx {100}; # (13b) $ExtendNumLetEx $KatakanaEx {300}; # (13b) -# rule 14 -# Do not break within emoji modifier sequences - -($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier; - # rules 15 - 17 # Pairs of Regional Indicators stay together. # With rule chaining disabled by ^, this rule will match exactly two of them. 
@@ -211,24 +200,3 @@ $HangulSyllable $HangulSyllable {200}; # Rule 999 # Match a single code point if no other rule applies. .; - - -## ------------------------------------------------- - -!!safe_reverse; - -# rule 3 -($Extend | $Format | $ZWJ)+ .?; - -# rule 6 -($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus); - -# rule 7b -$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter; - - -# rule 11 -($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric; - -# rule 13c -$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator; diff --git a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt index 806a4f9baf8..ac6ac1eb05b 100644 --- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt +++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt @@ -159,6 +159,8 @@ FF9E..FF9F> FFE3> 102E0> 10AE5..10AE6> +10D22..10D27> +10F46..10F50> 110B9..110BA> 11133..11134> 11173> @@ -177,12 +179,14 @@ FFE3> 1163F> 116B6..116B7> 1172B> +11839..1183A> 11A34> 11A47> 11A99> 11C3F> 11D42> 11D44..11D45> +11D97> 16AF0..16AF4> 16F8F..16F9F> 1D167..1D169> diff --git a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt index 707674e299d..382a2795601 100644 --- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt +++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt @@ -442,6 +442,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE 10A41>0032 # KHAROSHTHI DIGIT TWO 10A42>0033 # KHAROSHTHI DIGIT THREE 10A43>0034 # KHAROSHTHI DIGIT FOUR +10D30>0030 # HANIFI ROHINGYA DIGIT ZERO +10D31>0031 # HANIFI ROHINGYA DIGIT ONE +10D32>0032 # HANIFI ROHINGYA DIGIT TWO +10D33>0033 # HANIFI ROHINGYA DIGIT THREE +10D34>0034 # HANIFI ROHINGYA DIGIT FOUR +10D35>0035 # HANIFI ROHINGYA DIGIT FIVE +10D36>0036 # HANIFI ROHINGYA DIGIT SIX +10D37>0037 # HANIFI ROHINGYA DIGIT SEVEN +10D38>0038 # 
HANIFI ROHINGYA DIGIT EIGHT +10D39>0039 # HANIFI ROHINGYA DIGIT NINE 10E60>0031 # RUMI DIGIT ONE 10E61>0032 # RUMI DIGIT TWO 10E62>0033 # RUMI DIGIT THREE @@ -590,6 +600,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE 11D57>0037 # MASARAM GONDI DIGIT SEVEN 11D58>0038 # MASARAM GONDI DIGIT EIGHT 11D59>0039 # MASARAM GONDI DIGIT NINE +11DA0>0030 # GUNJALA GONDI DIGIT ZERO +11DA1>0031 # GUNJALA GONDI DIGIT ONE +11DA2>0032 # GUNJALA GONDI DIGIT TWO +11DA3>0033 # GUNJALA GONDI DIGIT THREE +11DA4>0034 # GUNJALA GONDI DIGIT FOUR +11DA5>0035 # GUNJALA GONDI DIGIT FIVE +11DA6>0036 # GUNJALA GONDI DIGIT SIX +11DA7>0037 # GUNJALA GONDI DIGIT SEVEN +11DA8>0038 # GUNJALA GONDI DIGIT EIGHT +11DA9>0039 # GUNJALA GONDI DIGIT NINE 16A60>0030 # MRO DIGIT ZERO 16A61>0031 # MRO DIGIT ONE 16A62>0032 # MRO DIGIT TWO diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java index 9e5050d55b8..e21d2fdee92 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java @@ -16,8 +16,6 @@ */ package org.apache.lucene.analysis.icu.segmentation; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; import com.ibm.icu.text.UTF16; @@ -65,18 +63,18 @@ final class BreakIteratorWrapper { } } - // See unicode doc L2/16-315 and also the RBBI rules for rationale. - // we don't include regional indicators here, because they aren't ambiguous for tagging, - // they need only be treated special for segmentation. + // See unicode doc L2/16-315 for rationale. + // basically for us the ambiguous cases (keycap/etc) as far as types go. 
static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze(); + // faster than doing hasBinaryProperty() checks, at the cost of 1KB ram + static final UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").freeze(); /** Returns true if the current text represents emoji character or sequence */ private boolean isEmoji(int current, int next) { int begin = start + current; int end = start + next; int codepoint = UTF16.charAt(text, 0, end, begin); - // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:] - if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) { + if (EMOJI.contains(codepoint)) { if (EMOJI_RK.contains(codepoint)) { // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence, // an emoji presentation selector or keycap follows. diff --git a/lucene/analysis/icu/src/java/overview.html b/lucene/analysis/icu/src/java/overview.html index 6fa5821c242..6e0a5d70f58 100644 --- a/lucene/analysis/icu/src/java/overview.html +++ b/lucene/analysis/icu/src/java/overview.html @@ -353,7 +353,7 @@ and

Backwards Compatibility

This module exists to provide up-to-date Unicode functionality that supports -the most recent version of Unicode (currently 10.0). However, some users who wish +the most recent version of Unicode (currently 11.0). However, some users who wish for stronger backwards compatibility can restrict {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk index e6c0ef770e7..9333a40bec9 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk differ diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk index d6bfdb4ebf3..7a138834e91 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/MyanmarSyllable.brk differ diff --git a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm index 1c3de121cad..4ded6c97762 100644 Binary files a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm and b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm differ diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi index 
0a4f0686a4f..caa784d1734 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/Latin-dont-break-on-hyphens.rbbi @@ -14,35 +14,71 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# Based on Default.rbbi, the default RBBI rules, based on UAX#29. -# Added dashes to $MidLetter, so that words aren't broken on single dashes. +# This test is a modified Default.rbbi that adds hyphens to MidLetter # +# Default.rbbi is from ICU (with some small modifications, to avoid CJK dictionary break, +# and status code change related to that) +# +# Copyright (C) 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Copyright (C) 2002-2016, International Business Machines Corporation +# and others. All Rights Reserved. +# +# file: word.txt +# +# ICU Word Break Rules +# See Unicode Standard Annex #29. +# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 +# with additions for Emoji Sequences from https://goo.gl/cluFCn +# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html +# +# Note: Updates to word.txt will usually need to be merged into +# word_POSIX.txt also. + +############################################################################## +# +# Character class definitions from TR 29 +# +############################################################################## !!chain; +!!quoted_literals_only; + # # Character Class Definitions. 
# -$CR = [\p{Word_Break = CR}]; -$LF = [\p{Word_Break = LF}]; -$Newline = [\p{Word_Break = Newline}]; -$Extend = [\p{Word_Break = Extend}]; -$Format = [\p{Word_Break = Format}]; -$Katakana = [\p{Word_Break = Katakana}]; -$ALetter = [\p{Word_Break = ALetter}]; -$MidNumLet = [\p{Word_Break = MidNumLet}]; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline} ]; +$Extend = [\p{Word_Break = Extend}]; +$ZWJ = [\p{Word_Break = ZWJ}]; +$Regional_Indicator = [\p{Word_Break = Regional_Indicator}]; +$Format = [\p{Word_Break = Format}]; +$Katakana = [\p{Word_Break = Katakana}]; +$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}]; +$ALetter = [\p{Word_Break = ALetter}]; +$Single_Quote = [\p{Word_Break = Single_Quote}]; +$Double_Quote = [\p{Word_Break = Double_Quote}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; # Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks -$Dash = [\N{HYPHEN-MINUS} - \N{HYPHEN} - \N{EN DASH} - \N{MINUS SIGN} - \N{SMALL HYPHEN-MINUS} - \N{FULLWIDTH HYPHEN-MINUS}]; -$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen -$MidNum = [\p{Word_Break = MidNum}]; -$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$Dash = [\N{HYPHEN-MINUS} + \N{HYPHEN} + \N{EN DASH} + \N{MINUS SIGN} + \N{SMALL HYPHEN-MINUS} + \N{FULLWIDTH HYPHEN-MINUS}]; +$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; + +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$WSegSpace = [\p{Word_Break = WSegSpace}]; +$Extended_Pict = [:ExtPict:]; + +$Han = [:Han:]; +$Hiragana = [:Hiragana:]; # Dictionary character set, for triggering language-based break engines. 
Currently @@ -50,58 +86,83 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; # 5.0 or later as the definition of Complex_Context was corrected to include all # characters requiring dictionary break. -$dictionary = [:LineBreak = Complex_Context:]; -$Control = [\p{Grapheme_Cluster_Break = Control}]; -$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not - # include the dictionary characters. +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$HangulSyllable = [\uac00-\ud7a3]; +$ComplexContext = [:LineBreak = Complex_Context:]; +$KanaKanji = [$Han $Hiragana $Katakana]; +$dictionaryCJK = [$Han $Hiragana $HangulSyllable]; +$dictionary = [$ComplexContext]; + +# leave CJK scripts out of ALetterPlus +$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]]; + # -# Rules 4 Ignore Format and Extend characters, +# Rules 4 Ignore Format and Extend characters, # except when they appear at the beginning of a region of text. # -$KatakanaEx = $Katakana ($Extend | $Format)*; -$ALetterEx = $ALetterPlus ($Extend | $Format)*; -$MidNumLetEx = $MidNumLet ($Extend | $Format)*; -$MidLetterEx = $MidLetter ($Extend | $Format)*; -$MidNumEx = $MidNum ($Extend | $Format)*; -$NumericEx = $Numeric ($Extend | $Format)*; -$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; +# TODO: check if handling of katakana in dictionary makes rules incorrect/void +$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*; +$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*; +$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*; +$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*; +$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*; +$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*; +$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*; +$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*; +$Regional_IndicatorEx 
= $Regional_Indicator ($Extend | $Format | $ZWJ)*; -$Hiragana = [\p{script=Hiragana}]; $Ideographic = [\p{Ideographic}]; -$HiraganaEx = $Hiragana ($Extend | $Format)*; -$IdeographicEx = $Ideographic ($Extend | $Format)*; +$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*; +$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*; ## ------------------------------------------------- -!!forward; - - # Rule 3 - CR x LF # $CR $LF; +# Rule 3c ZWJ x Extended_Pict. Precedes WB4, so no intervening Extend chars allowed. +# +$ZWJ $Extended_Pict; + +# Rule 3d - Keep horizontal whitespace together. +# +$WSegSpace $WSegSpace; + # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning # of a region of Text. The rule here comes into play when the start of text # begins with a group of Format chars, or with a "word" consisting of a single # char that is not in any of the listed word break categories followed by -# format char(s). -[^$CR $LF $Newline]? ($Extend | $Format)+; +# format char(s), or is not a CJK dictionary character. +[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+; $NumericEx {100}; $ALetterEx {200}; +$HangulSyllable {200}; +$Hebrew_LetterEx{200}; $KatakanaEx {300}; # note: these status values override those from rule 5 -$HiraganaEx {300}; # by virtual of being numerically larger. +$HiraganaEx {300}; # by virtue of being numerically larger. $IdeographicEx {400}; # +$Extended_Pict ($Extend | $Format | $ZWJ)*; + # # rule 5 # Do not break between most letters. 
# -$ALetterEx $ALetterEx {200}; +($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200}; # rule 6 and 7 -$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; +($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200}; + +# rule 7a +$Hebrew_LetterEx $Single_QuoteEx {200}; + +# rule 7b and 7c +$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200}; # rule 8 @@ -109,27 +170,43 @@ $NumericEx $NumericEx {100}; # rule 9 -$ALetterEx $NumericEx {200}; +($ALetterEx | $Hebrew_LetterEx) $NumericEx {200}; # rule 10 -$NumericEx $ALetterEx {200}; +$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200}; -# rule 11 and 12 +# rule 11 and 12 -$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; +$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100}; # rule 13 - $KatakanaEx $KatakanaEx {300}; # rule 13a/b -$ALetterEx $ExtendNumLetEx {200}; # (13a) -$NumericEx $ExtendNumLetEx {100}; # (13a) -$KatakanaEx $ExtendNumLetEx {300}; # (13a) -$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) +$ALetterEx $ExtendNumLetEx {200}; # (13a) +$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a) +$NumericEx $ExtendNumLetEx {100}; # (13a) +$KatakanaEx $ExtendNumLetEx {300}; # (13a) +$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) + +$ExtendNumLetEx $ALetterEx {200}; # (13b) +$ExtendNumLetEx $Hebrew_Letter {200}; # (13b) +$ExtendNumLetEx $NumericEx {100}; # (13b) +$ExtendNumLetEx $KatakanaEx {300}; # (13b) + +# rules 15 - 17 +# Pairs of Regional Indicators stay together. +# With rule chaining disabled by ^, this rule will match exactly two of them. +# No other rule begins with a Regional_Indicator, so chaining cannot extend the match. +# +^$Regional_IndicatorEx $Regional_IndicatorEx; + +# special handling for CJK characters: chain for later dictionary segmentation +$HangulSyllable $HangulSyllable {200}; + +# Rule 999 +# Match a single code point if no other rule applies. 
+.; -$ExtendNumLetEx $ALetterEx {200}; # (13b) -$ExtendNumLetEx $NumericEx {100}; # (13b) -$ExtendNumLetEx $KatakanaEx {300}; # (13b) diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java index 98939752cbe..91c3d929d89 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute; import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.io.StringReader; @@ -316,6 +317,20 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { new String[] { "", "", "", "" }); } + public void testEmojiFromTheFuture() throws Exception { + // pick an unassigned character with extended_pictographic + int ch = new UnicodeSet("[[:Extended_Pictographic:]&[:Unassigned:]]").getRangeStart(0); + String value = new String(Character.toChars(ch)); + // should analyze to emoji type + BaseTokenStreamTestCase.assertAnalyzesTo(a, value, + new String[] { value }, + new String[] { "" }); + // shouldn't break in a sequence + BaseTokenStreamTestCase.assertAnalyzesTo(a, value + '\u200D' + value, + new String[] { value + '\u200D' + value }, + new String[] { "" }); + } + /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index 49a119096fa..4847f5d9a7c 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -30,7 +30,7 @@ com.fasterxml.jackson.core.version = 2.9.5 /com.googlecode.mp4parser/isoparser = 1.1.18 
/com.healthmarketscience.jackcess/jackcess = 2.1.8 /com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4 -/com.ibm.icu/icu4j = 61.1 +/com.ibm.icu/icu4j = 62.1 /com.lmax/disruptor = 3.4.0 /com.pff/java-libpst = 0.8.1 diff --git a/lucene/licenses/icu4j-61.1.jar.sha1 b/lucene/licenses/icu4j-61.1.jar.sha1 deleted file mode 100644 index bde7409baf4..00000000000 --- a/lucene/licenses/icu4j-61.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -28d33b5e44e72edcc66a5da7a34a42147f38d987 diff --git a/lucene/licenses/icu4j-62.1.jar.sha1 b/lucene/licenses/icu4j-62.1.jar.sha1 new file mode 100644 index 00000000000..20fa5c7528d --- /dev/null +++ b/lucene/licenses/icu4j-62.1.jar.sha1 @@ -0,0 +1 @@ +7a4d00d5ec5febd252a6182e8b6e87a0a9821f81 diff --git a/solr/licenses/icu4j-61.1.jar.sha1 b/solr/licenses/icu4j-61.1.jar.sha1 deleted file mode 100644 index bde7409baf4..00000000000 --- a/solr/licenses/icu4j-61.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -28d33b5e44e72edcc66a5da7a34a42147f38d987 diff --git a/solr/licenses/icu4j-62.1.jar.sha1 b/solr/licenses/icu4j-62.1.jar.sha1 new file mode 100644 index 00000000000..20fa5c7528d --- /dev/null +++ b/solr/licenses/icu4j-62.1.jar.sha1 @@ -0,0 +1 @@ +7a4d00d5ec5febd252a6182e8b6e87a0a9821f81