LUCENE-8366: Upgrade to ICU 62.1

This commit is contained in:
Robert Muir 2018-06-21 20:08:17 -04:00
parent 5cd8aa4ff9
commit 2ea416ee3d
17 changed files with 194 additions and 108 deletions

View File

@ -123,6 +123,10 @@ Changes in Runtime Behavior:
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
findForcedMerges and findForcedDeletesMerges (Erick Erickson)
Other:
* LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's
Extended_Pictographic property. (Robert Muir)
======================= Lucene 7.4.1 =======================

View File

@ -24,15 +24,15 @@ import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was created using ICU4J v61.1.0.0
* The data was created using ICU4J v62.1.0.0
* <p>
* Unicode version: 10.0.0.0
* Unicode version: 11.0.0.0
*/
public final class UnicodeProps {
private UnicodeProps() {}
/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "10.0.0.0";
public static final String UNICODE_VERSION = "11.0.0.0";
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(

View File

@ -63,14 +63,10 @@ $MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$E_Base = [\p{Word_Break = EB}];
$E_Modifier = [\p{Word_Break = EM}];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
$EBG = [\p{Word_Break = EBG}];
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$WSegSpace = [\p{Word_Break = WSegSpace}];
$Extended_Pict = [:ExtPict:];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
@ -115,17 +111,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
!!forward;
# Rule 3 - CR x LF
#
$CR $LF;
# Rule 3c ZWJ x Extended_Pict. Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ ($Extended_Pict | $EmojiNRK);
$ZWJ $Extended_Pict;
# Rule 3d - Keep horizontal whitespace together.
#
$WSegSpace $WSegSpace;
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text. The rule here comes into play when the start of text
@ -142,8 +138,6 @@ $KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
$E_Base ($Extend | $Format | $ZWJ)*;
$E_Modifier ($Extend | $Format | $ZWJ)*;
$Extended_Pict ($Extend | $Format | $ZWJ)*;
#
@ -193,11 +187,6 @@ $ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
# rule 14
# Do not break within emoji modifier sequences
($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
# rules 15 - 17
# Pairs of Regional Indicators stay together.
# With rule chaining disabled by ^, this rule will match exactly two of them.
@ -211,24 +200,3 @@ $HangulSyllable $HangulSyllable {200};
# Rule 999
# Match a single code point if no other rule applies.
.;
## -------------------------------------------------
!!safe_reverse;
# rule 3
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
# rule 7b
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
# rule 11
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
# rule 13c
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;

View File

@ -159,6 +159,8 @@ FF9E..FF9F>
FFE3>
102E0>
10AE5..10AE6>
10D22..10D27>
10F46..10F50>
110B9..110BA>
11133..11134>
11173>
@ -177,12 +179,14 @@ FFE3>
1163F>
116B6..116B7>
1172B>
11839..1183A>
11A34>
11A47>
11A99>
11C3F>
11D42>
11D44..11D45>
11D97>
16AF0..16AF4>
16F8F..16F9F>
1D167..1D169>

View File

@ -442,6 +442,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
10A41>0032 # KHAROSHTHI DIGIT TWO
10A42>0033 # KHAROSHTHI DIGIT THREE
10A43>0034 # KHAROSHTHI DIGIT FOUR
10D30>0030 # HANIFI ROHINGYA DIGIT ZERO
10D31>0031 # HANIFI ROHINGYA DIGIT ONE
10D32>0032 # HANIFI ROHINGYA DIGIT TWO
10D33>0033 # HANIFI ROHINGYA DIGIT THREE
10D34>0034 # HANIFI ROHINGYA DIGIT FOUR
10D35>0035 # HANIFI ROHINGYA DIGIT FIVE
10D36>0036 # HANIFI ROHINGYA DIGIT SIX
10D37>0037 # HANIFI ROHINGYA DIGIT SEVEN
10D38>0038 # HANIFI ROHINGYA DIGIT EIGHT
10D39>0039 # HANIFI ROHINGYA DIGIT NINE
10E60>0031 # RUMI DIGIT ONE
10E61>0032 # RUMI DIGIT TWO
10E62>0033 # RUMI DIGIT THREE
@ -590,6 +600,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
11D57>0037 # MASARAM GONDI DIGIT SEVEN
11D58>0038 # MASARAM GONDI DIGIT EIGHT
11D59>0039 # MASARAM GONDI DIGIT NINE
11DA0>0030 # GUNJALA GONDI DIGIT ZERO
11DA1>0031 # GUNJALA GONDI DIGIT ONE
11DA2>0032 # GUNJALA GONDI DIGIT TWO
11DA3>0033 # GUNJALA GONDI DIGIT THREE
11DA4>0034 # GUNJALA GONDI DIGIT FOUR
11DA5>0035 # GUNJALA GONDI DIGIT FIVE
11DA6>0036 # GUNJALA GONDI DIGIT SIX
11DA7>0037 # GUNJALA GONDI DIGIT SEVEN
11DA8>0038 # GUNJALA GONDI DIGIT EIGHT
11DA9>0039 # GUNJALA GONDI DIGIT NINE
16A60>0030 # MRO DIGIT ZERO
16A61>0031 # MRO DIGIT ONE
16A62>0032 # MRO DIGIT TWO

View File

@ -16,8 +16,6 @@
*/
package org.apache.lucene.analysis.icu.segmentation;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
@ -65,18 +63,18 @@ final class BreakIteratorWrapper {
}
}
// See unicode doc L2/16-315 and also the RBBI rules for rationale.
// we don't include regional indicators here, because they aren't ambiguous for tagging,
// they need only be treated special for segmentation.
// See unicode doc L2/16-315 for rationale.
// basically, for us these are the ambiguous cases (keycap/etc) as far as types go.
static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
// faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
static final UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").freeze();
/** Returns true if the current text represents emoji character or sequence */
private boolean isEmoji(int current, int next) {
int begin = start + current;
int end = start + next;
int codepoint = UTF16.charAt(text, 0, end, begin);
// TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
if (EMOJI.contains(codepoint)) {
if (EMOJI_RK.contains(codepoint)) {
// if it's in EmojiRK, we don't treat it as emoji unless there is evidence it forms an emoji sequence,
// an emoji presentation selector or keycap follows.

View File

@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 10.0). However, some users who wish
the most recent version of Unicode (currently 11.0). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.

View File

@ -14,35 +14,71 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
# Added dashes to $MidLetter, so that words aren't broken on single dashes.
# This test is a modified Default.rbbi that adds hyphens to MidLetter
#
# Default.rbbi is from ICU (with some small modifications, to avoid CJK dictionary break,
# and status code change related to that)
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
# with additions for Emoji Sequences from https://goo.gl/cluFCn
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################
!!chain;
!!quoted_literals_only;
#
# Character Class Definitions.
#
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline} ];
$Extend = [\p{Word_Break = Extend}];
$ZWJ = [\p{Word_Break = ZWJ}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
$Dash = [\N{HYPHEN-MINUS}
\N{HYPHEN}
\N{EN DASH}
\N{MINUS SIGN}
\N{SMALL HYPHEN-MINUS}
\N{FULLWIDTH HYPHEN-MINUS}];
$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$Dash = [\N{HYPHEN-MINUS}
\N{HYPHEN}
\N{EN DASH}
\N{MINUS SIGN}
\N{SMALL HYPHEN-MINUS}
\N{FULLWIDTH HYPHEN-MINUS}];
$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$WSegSpace = [\p{Word_Break = WSegSpace}];
$Extended_Pict = [:ExtPict:];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
# Dictionary character set, for triggering language-based break engines. Currently
@ -50,58 +86,83 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
# include the dictionary characters.
$Control = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
$dictionary = [$ComplexContext];
# leave CJK scripts out of ALetterPlus
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rules 4 Ignore Format and Extend characters,
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
$KatakanaEx = $Katakana ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
$Hiragana = [\p{script=Hiragana}];
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
!!forward;
# Rule 3 - CR x LF
#
$CR $LF;
# Rule 3c ZWJ x Extended_Pict. Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ $Extended_Pict;
# Rule 3d - Keep horizontal whitespace together.
#
$WSegSpace $WSegSpace;
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s).
[^$CR $LF $Newline]? ($Extend | $Format)+;
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$Hebrew_LetterEx{200};
$KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtual of being numerically larger.
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
$Extended_Pict ($Extend | $Format | $ZWJ)*;
#
# rule 5
# Do not break between most letters.
#
$ALetterEx $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 7a
$Hebrew_LetterEx $Single_QuoteEx {200};
# rule 7b and 7c
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
# rule 8
@ -109,27 +170,43 @@ $NumericEx $NumericEx {100};
# rule 9
$ALetterEx $NumericEx {200};
($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
# rule 10
$NumericEx $ALetterEx {200};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
# rule 13
$KatakanaEx $KatakanaEx {300};
# rule 13a/b
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
# rules 15 - 17
# Pairs of Regional Indicators stay together.
# With rule chaining disabled by ^, this rule will match exactly two of them.
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
^$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
# Rule 999
# Match a single code point if no other rule applies.
.;
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import java.io.IOException;
import java.io.StringReader;
@ -316,6 +317,20 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
}
/** unassigned Extended_Pictographic code points should be typed as emoji and stay joined across ZWJ */
public void testEmojiFromTheFuture() throws Exception {
// first code point that is Extended_Pictographic but still unassigned
UnicodeSet unassignedPictographs = new UnicodeSet("[[:Extended_Pictographic:]&[:Unassigned:]]");
String futureEmoji = new String(Character.toChars(unassignedPictographs.getRangeStart(0)));
// on its own it must come out as a single emoji-typed token
BaseTokenStreamTestCase.assertAnalyzesTo(a, futureEmoji,
new String[] { futureEmoji },
new String[] { "<EMOJI>" });
// two of them joined by a zero width joiner must remain one emoji-typed token
String zwjSequence = futureEmoji + '\u200D' + futureEmoji;
BaseTokenStreamTestCase.assertAnalyzesTo(a, zwjSequence,
new String[] { zwjSequence },
new String[] { "<EMOJI>" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

View File

@ -30,7 +30,7 @@ com.fasterxml.jackson.core.version = 2.9.5
/com.googlecode.mp4parser/isoparser = 1.1.18
/com.healthmarketscience.jackcess/jackcess = 2.1.8
/com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4
/com.ibm.icu/icu4j = 61.1
/com.ibm.icu/icu4j = 62.1
/com.lmax/disruptor = 3.4.0
/com.pff/java-libpst = 0.8.1

View File

@ -1 +0,0 @@
28d33b5e44e72edcc66a5da7a34a42147f38d987

View File

@ -0,0 +1 @@
7a4d00d5ec5febd252a6182e8b6e87a0a9821f81

View File

@ -1 +0,0 @@
28d33b5e44e72edcc66a5da7a34a42147f38d987

View File

@ -0,0 +1 @@
7a4d00d5ec5febd252a6182e8b6e87a0a9821f81