mirror of https://github.com/apache/lucene.git
LUCENE-8366: Upgrade to ICU 62.1
This commit is contained in:
parent
5cd8aa4ff9
commit
2ea416ee3d
|
@ -123,6 +123,10 @@ Changes in Runtime Behavior:
|
|||
* LUCENE-7976: TieredMergePolicy now respects maxSegmentSizeMB by default when executing
|
||||
findForcedMerges and findForcedDeletesMerges (Erick Erickson)
|
||||
|
||||
Other:
|
||||
|
||||
* LUCENE-8366: Upgrade to ICU 62.1. Emoji handling now uses Unicode 11's
|
||||
Extended_Pictographic property. (Robert Muir)
|
||||
|
||||
======================= Lucene 7.4.1 =======================
|
||||
|
||||
|
|
|
@ -24,15 +24,15 @@ import org.apache.lucene.util.SparseFixedBitSet;
|
|||
|
||||
/**
|
||||
* This file contains unicode properties used by various {@link CharTokenizer}s.
|
||||
* The data was created using ICU4J v61.1.0.0
|
||||
* The data was created using ICU4J v62.1.0.0
|
||||
* <p>
|
||||
* Unicode version: 10.0.0.0
|
||||
* Unicode version: 11.0.0.0
|
||||
*/
|
||||
public final class UnicodeProps {
|
||||
private UnicodeProps() {}
|
||||
|
||||
/** Unicode version that was used to generate this file: {@value} */
|
||||
public static final String UNICODE_VERSION = "10.0.0.0";
|
||||
public static final String UNICODE_VERSION = "11.0.0.0";
|
||||
|
||||
/** Bitset with Unicode WHITESPACE code points. */
|
||||
public static final Bits WHITESPACE = createBits(
|
||||
|
|
|
@ -63,14 +63,10 @@ $MidNumLet = [\p{Word_Break = MidNumLet}];
|
|||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$E_Base = [\p{Word_Break = EB}];
|
||||
$E_Modifier = [\p{Word_Break = EM}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
$EBG = [\p{Word_Break = EBG}];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [:ExtPict:];
|
||||
|
||||
$Han = [:Han:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
|
@ -115,17 +111,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
|||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$ZWJ ($Extended_Pict | $EmojiNRK);
|
||||
$ZWJ $Extended_Pict;
|
||||
|
||||
# Rule 3d - Keep horizontal whitespace together.
|
||||
#
|
||||
$WSegSpace $WSegSpace;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text. The rule here comes into play when the start of text
|
||||
|
@ -142,8 +138,6 @@ $KatakanaEx {300}; # note: these status values override those from rule 5
|
|||
$HiraganaEx {300}; # by virtue of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
$E_Base ($Extend | $Format | $ZWJ)*;
|
||||
$E_Modifier ($Extend | $Format | $ZWJ)*;
|
||||
$Extended_Pict ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
#
|
||||
|
@ -193,11 +187,6 @@ $ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
|
|||
$ExtendNumLetEx $NumericEx {100}; # (13b)
|
||||
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
||||
|
||||
# rule 14
|
||||
# Do not break within emoji modifier sequences
|
||||
|
||||
($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
|
@ -211,24 +200,3 @@ $HangulSyllable $HangulSyllable {200};
|
|||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# rule 3
|
||||
($Extend | $Format | $ZWJ)+ .?;
|
||||
|
||||
# rule 6
|
||||
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
|
||||
|
||||
# rule 7b
|
||||
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
|
||||
|
||||
|
||||
# rule 11
|
||||
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
|
||||
|
||||
# rule 13c
|
||||
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
||||
|
|
|
@ -159,6 +159,8 @@ FF9E..FF9F>
|
|||
FFE3>
|
||||
102E0>
|
||||
10AE5..10AE6>
|
||||
10D22..10D27>
|
||||
10F46..10F50>
|
||||
110B9..110BA>
|
||||
11133..11134>
|
||||
11173>
|
||||
|
@ -177,12 +179,14 @@ FFE3>
|
|||
1163F>
|
||||
116B6..116B7>
|
||||
1172B>
|
||||
11839..1183A>
|
||||
11A34>
|
||||
11A47>
|
||||
11A99>
|
||||
11C3F>
|
||||
11D42>
|
||||
11D44..11D45>
|
||||
11D97>
|
||||
16AF0..16AF4>
|
||||
16F8F..16F9F>
|
||||
1D167..1D169>
|
||||
|
|
|
@ -442,6 +442,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
10A41>0032 # KHAROSHTHI DIGIT TWO
|
||||
10A42>0033 # KHAROSHTHI DIGIT THREE
|
||||
10A43>0034 # KHAROSHTHI DIGIT FOUR
|
||||
10D30>0030 # HANIFI ROHINGYA DIGIT ZERO
|
||||
10D31>0031 # HANIFI ROHINGYA DIGIT ONE
|
||||
10D32>0032 # HANIFI ROHINGYA DIGIT TWO
|
||||
10D33>0033 # HANIFI ROHINGYA DIGIT THREE
|
||||
10D34>0034 # HANIFI ROHINGYA DIGIT FOUR
|
||||
10D35>0035 # HANIFI ROHINGYA DIGIT FIVE
|
||||
10D36>0036 # HANIFI ROHINGYA DIGIT SIX
|
||||
10D37>0037 # HANIFI ROHINGYA DIGIT SEVEN
|
||||
10D38>0038 # HANIFI ROHINGYA DIGIT EIGHT
|
||||
10D39>0039 # HANIFI ROHINGYA DIGIT NINE
|
||||
10E60>0031 # RUMI DIGIT ONE
|
||||
10E61>0032 # RUMI DIGIT TWO
|
||||
10E62>0033 # RUMI DIGIT THREE
|
||||
|
@ -590,6 +600,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
11D57>0037 # MASARAM GONDI DIGIT SEVEN
|
||||
11D58>0038 # MASARAM GONDI DIGIT EIGHT
|
||||
11D59>0039 # MASARAM GONDI DIGIT NINE
|
||||
11DA0>0030 # GUNJALA GONDI DIGIT ZERO
|
||||
11DA1>0031 # GUNJALA GONDI DIGIT ONE
|
||||
11DA2>0032 # GUNJALA GONDI DIGIT TWO
|
||||
11DA3>0033 # GUNJALA GONDI DIGIT THREE
|
||||
11DA4>0034 # GUNJALA GONDI DIGIT FOUR
|
||||
11DA5>0035 # GUNJALA GONDI DIGIT FIVE
|
||||
11DA6>0036 # GUNJALA GONDI DIGIT SIX
|
||||
11DA7>0037 # GUNJALA GONDI DIGIT SEVEN
|
||||
11DA8>0038 # GUNJALA GONDI DIGIT EIGHT
|
||||
11DA9>0039 # GUNJALA GONDI DIGIT NINE
|
||||
16A60>0030 # MRO DIGIT ZERO
|
||||
16A61>0031 # MRO DIGIT ONE
|
||||
16A62>0032 # MRO DIGIT TWO
|
||||
|
|
|
@ -16,8 +16,6 @@
|
|||
*/
|
||||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
@ -65,18 +63,18 @@ final class BreakIteratorWrapper {
|
|||
}
|
||||
}
|
||||
|
||||
// See unicode doc L2/16-315 and also the RBBI rules for rationale.
|
||||
// we don't include regional indicators here, because they aren't ambiguous for tagging,
|
||||
// they need only be treated special for segmentation.
|
||||
// See unicode doc L2/16-315 for rationale.
|
||||
// For our purposes, these are the ambiguous cases (keycap, etc.) as far as types go.
|
||||
static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
|
||||
// faster than doing hasBinaryProperty() checks, at the cost of 1KB ram
|
||||
static final UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").freeze();
|
||||
|
||||
/** Returns true if the current text represents an emoji character or sequence */
|
||||
private boolean isEmoji(int current, int next) {
|
||||
int begin = start + current;
|
||||
int end = start + next;
|
||||
int codepoint = UTF16.charAt(text, 0, end, begin);
|
||||
// TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
|
||||
if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
|
||||
if (EMOJI.contains(codepoint)) {
|
||||
if (EMOJI_RK.contains(codepoint)) {
|
||||
// if it's in EmojiRK, we don't treat it as emoji unless there is evidence it forms an emoji sequence,
|
||||
// an emoji presentation selector or keycap follows.
|
||||
|
|
|
@ -353,7 +353,7 @@ and
|
|||
<h1><a name="backcompat">Backwards Compatibility</a></h1>
|
||||
<p>
|
||||
This module exists to provide up-to-date Unicode functionality that supports
|
||||
the most recent version of Unicode (currently 10.0). However, some users who wish
|
||||
the most recent version of Unicode (currently 11.0). However, some users who wish
|
||||
for stronger backwards compatibility can restrict
|
||||
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
|
||||
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -14,11 +14,36 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Based on Default.rbbi, the default RBBI rules, based on UAX#29.
|
||||
# Added dashes to $MidLetter, so that words aren't broken on single dashes.
|
||||
# This test is a modified Default.rbbi that adds hyphens to MidLetter
|
||||
#
|
||||
# Default.rbbi is from ICU (with some small modifications, to avoid CJK dictionary break,
|
||||
# and status code change related to that)
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# with additions for Emoji Sequences from https://goo.gl/cluFCn
|
||||
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
|
@ -26,11 +51,16 @@
|
|||
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Newline = [\p{Word_Break = Newline} ];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$ZWJ = [\p{Word_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
# Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks
|
||||
$Dash = [\N{HYPHEN-MINUS}
|
||||
|
@ -42,7 +72,13 @@ $Dash = [\N{HYPHEN-MINUS}
|
|||
$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # Don't break on (single) hyphen
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
|
||||
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$WSegSpace = [\p{Word_Break = WSegSpace}];
|
||||
$Extended_Pict = [:ExtPict:];
|
||||
|
||||
$Han = [:Han:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
|
@ -50,58 +86,83 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
|||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
|
||||
# include the dictionary characters.
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
|
||||
$dictionary = [$ComplexContext];
|
||||
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
|
||||
#
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
$Hiragana = [\p{script=Hiragana}];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!forward;
|
||||
|
||||
|
||||
# Rule 3 - CR x LF
|
||||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c ZWJ x Extended_Pict. Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$ZWJ $Extended_Pict;
|
||||
|
||||
# Rule 3d - Keep horizontal whitespace together.
|
||||
#
|
||||
$WSegSpace $WSegSpace;
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
# of a region of Text. The rule here comes into play when the start of text
|
||||
# begins with a group of Format chars, or with a "word" consisting of a single
|
||||
# char that is not in any of the listed word break categories followed by
|
||||
# format char(s).
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
# format char(s), or is not a CJK dictionary character.
|
||||
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
|
||||
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
$HangulSyllable {200};
|
||||
$Hebrew_LetterEx{200};
|
||||
$KatakanaEx {300}; # note: these status values override those from rule 5
|
||||
$HiraganaEx {300}; # by virtual of being numerically larger.
|
||||
$HiraganaEx {300}; # by virtue of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
$Extended_Pict ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
#
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
$ALetterEx $ALetterEx {200};
|
||||
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 6 and 7
|
||||
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
||||
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 7a
|
||||
$Hebrew_LetterEx $Single_QuoteEx {200};
|
||||
|
||||
# rule 7b and 7c
|
||||
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
|
||||
|
||||
# rule 8
|
||||
|
||||
|
@ -109,27 +170,43 @@ $NumericEx $NumericEx {100};
|
|||
|
||||
# rule 9
|
||||
|
||||
$ALetterEx $NumericEx {200};
|
||||
($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
|
||||
|
||||
# rule 10
|
||||
|
||||
$NumericEx $ALetterEx {200};
|
||||
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
|
||||
|
||||
# rule 13
|
||||
|
||||
$KatakanaEx $KatakanaEx {300};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterEx $ExtendNumLetEx {200}; # (13a)
|
||||
$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
|
||||
$NumericEx $ExtendNumLetEx {100}; # (13a)
|
||||
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
|
||||
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
||||
|
||||
$ExtendNumLetEx $ALetterEx {200}; # (13b)
|
||||
$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
|
||||
$ExtendNumLetEx $NumericEx {100}; # (13b)
|
||||
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
||||
|
||||
# rules 15 - 17
|
||||
# Pairs of Regional Indicators stay together.
|
||||
# With rule chaining disabled by ^, this rule will match exactly two of them.
|
||||
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
|
||||
#
|
||||
^$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
|
||||
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
@ -316,6 +317,20 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
|
||||
}
|
||||
|
||||
/**
 * Checks that a code point which is Extended_Pictographic but not yet assigned
 * is tokenized with the {@code <EMOJI>} type, both on its own and when two
 * copies are joined by U+200D.
 */
public void testEmojiFromTheFuture() throws Exception {
|
||||
// pick the first code point that is Extended_Pictographic but still unassigned
|
||||
int ch = new UnicodeSet("[[:Extended_Pictographic:]&[:Unassigned:]]").getRangeStart(0);
|
||||
String value = new String(Character.toChars(ch));
|
||||
// on its own it should come out as a single token of emoji type
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, value,
|
||||
new String[] { value },
|
||||
new String[] { "<EMOJI>" });
|
||||
// two copies joined by U+200D should stay together as one emoji token
|
||||
BaseTokenStreamTestCase.assertAnalyzesTo(a, value + '\u200D' + value,
|
||||
new String[] { value + '\u200D' + value },
|
||||
new String[] { "<EMOJI>" });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
|
||||
|
|
|
@ -30,7 +30,7 @@ com.fasterxml.jackson.core.version = 2.9.5
|
|||
/com.googlecode.mp4parser/isoparser = 1.1.18
|
||||
/com.healthmarketscience.jackcess/jackcess = 2.1.8
|
||||
/com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4
|
||||
/com.ibm.icu/icu4j = 61.1
|
||||
/com.ibm.icu/icu4j = 62.1
|
||||
/com.lmax/disruptor = 3.4.0
|
||||
/com.pff/java-libpst = 0.8.1
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
28d33b5e44e72edcc66a5da7a34a42147f38d987
|
|
@ -0,0 +1 @@
|
|||
7a4d00d5ec5febd252a6182e8b6e87a0a9821f81
|
|
@ -1 +0,0 @@
|
|||
28d33b5e44e72edcc66a5da7a34a42147f38d987
|
|
@ -0,0 +1 @@
|
|||
7a4d00d5ec5febd252a6182e8b6e87a0a9821f81
|
Loading…
Reference in New Issue