LUCENE-8122: Upgrade analysis/icu to ICU 60.2

This commit is contained in:
Robert Muir 2018-01-08 16:33:38 -05:00
parent a3a0e0b11e
commit 07407a5b53
18 changed files with 128 additions and 40 deletions

View File

@ -132,6 +132,8 @@ Other
* LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.
(Kai Chan via Adrien Grand)
* LUCENE-8122: Upgrade analysis/icu to ICU 60.2. (Robert Muir)
======================= Lucene 7.2.0 =======================
API Changes

View File

@ -14,16 +14,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
# This file is from ICU (with some small modifications, to avoid CJK dictionary break,
# and status code change related to that)
#
# Copyright (C) 2002-2013, International Business Machines Corporation
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
# with additions for Emoji Sequences from https://goo.gl/cluFCn
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
@ -35,6 +40,7 @@
##############################################################################
!!chain;
!!quoted_literals_only;
#
@ -43,8 +49,9 @@
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Newline = [\p{Word_Break = Newline} ];
$Extend = [\p{Word_Break = Extend}];
$ZWJ = [\p{Word_Break = ZWJ}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
@ -57,6 +64,13 @@ $MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$E_Base = [\p{Word_Break = EB}];
$E_Modifier = [\p{Word_Break = EM}];
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
$Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
$EBG = [\p{Word_Break = EBG}];
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
@ -83,21 +97,21 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
## -------------------------------------------------
@ -108,12 +122,17 @@ $IdeographicEx = $Ideographic ($Extend | $Format)*;
#
$CR $LF;
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ ($Extended_Pict | $EmojiNRK);
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
# of a region of text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
$NumericEx {100};
$ALetterEx {200};
@ -123,6 +142,10 @@ $KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
$E_Base ($Extend | $Format | $ZWJ)*;
$E_Modifier ($Extend | $Format | $ZWJ)*;
$Extended_Pict ($Extend | $Format | $ZWJ)*;
#
# rule 5
# Do not break between most letters.
@ -170,9 +193,42 @@ $ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
# rule 13c
# rule 14
# Do not break within emoji modifier sequences
$Regional_IndicatorEx $Regional_IndicatorEx;
($E_Base | $EBG) ($Format | $Extend | $ZWJ)* $E_Modifier;
# rules 15 - 17
# Pairs of Regional Indicators stay together.
# With rule chaining disabled by ^, this rule will match exactly two of them.
# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
^$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
# Rule 999
# Match a single code point if no other rule applies.
.;
## -------------------------------------------------
!!safe_reverse;
# rule 3
($Extend | $Format | $ZWJ)+ .?;
# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* ($Hebrew_Letter | $ALetterPlus);
# rule 7b
$Double_Quote ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
# rule 11
($MidNum | $MidNumLet | $Single_Quote) ($Format | $Extend | $ZWJ)* $Numeric;
# rule 13c
$Regional_Indicator ($Format | $Extend | $ZWJ)* $Regional_Indicator;

View File

@ -73,12 +73,14 @@
0A4D>
0ABC>
0ACD>
0AFD..0AFF>
0B3C>
0B4D>
0BCD>
0C4D>
0CBC>
0CCD>
0D3B..0D3C>
0D4D>
0DCA>
0E47..0E4C>
@ -112,10 +114,10 @@
1CD0..1CE8>
1CED>
1CF4>
1CF8..1CF9>
1CF7..1CF9>
1D2C..1D6A>
1DC4..1DCF>
1DF5>
1DF5..1DF9>
1DFD..1DFF>
1FBD>
1FBF..1FC1>
@ -175,7 +177,12 @@ FFE3>
1163F>
116B6..116B7>
1172B>
11A34>
11A47>
11A99>
11C3F>
11D42>
11D44..11D45>
16AF0..16AF4>
16F8F..16F9F>
1D167..1D169>

View File

@ -580,6 +580,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
11C57>0037 # BHAIKSUKI DIGIT SEVEN
11C58>0038 # BHAIKSUKI DIGIT EIGHT
11C59>0039 # BHAIKSUKI DIGIT NINE
11D50>0030 # MASARAM GONDI DIGIT ZERO
11D51>0031 # MASARAM GONDI DIGIT ONE
11D52>0032 # MASARAM GONDI DIGIT TWO
11D53>0033 # MASARAM GONDI DIGIT THREE
11D54>0034 # MASARAM GONDI DIGIT FOUR
11D55>0035 # MASARAM GONDI DIGIT FIVE
11D56>0036 # MASARAM GONDI DIGIT SIX
11D57>0037 # MASARAM GONDI DIGIT SEVEN
11D58>0038 # MASARAM GONDI DIGIT EIGHT
11D59>0039 # MASARAM GONDI DIGIT NINE
16A60>0030 # MRO DIGIT ZERO
16A61>0031 # MRO DIGIT ONE
16A62>0032 # MRO DIGIT TWO

View File

@ -1,3 +1,5 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
@ -7,7 +9,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 9.0.0
* Unicode 10.0.0
# Canonical_Combining_Class (ccc) values
0300..0314:230
@ -164,6 +166,7 @@
0C56:91
0CBC:7
0CCD:9
0D3B..0D3C:9
0D4D:9
0DCA:9
0E38..0E39:103
@ -234,6 +237,9 @@
1DCF:220
1DD0:202
1DD1..1DF5:230
1DF6:232
1DF7..1DF8:228
1DF9:220
1DFB:230
1DFC:233
1DFD:220
@ -322,7 +328,12 @@ FE2E..FE2F:230
116B6:9
116B7:7
1172B:9
11A34:9
11A47:9
11A99:9
11C3F:9
11D42:7
11D44..11D45:9
16AF0..16AF4:1
16B30..16B36:230
1BC9E:1

View File

@ -1,3 +1,5 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
@ -11,7 +13,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 9.0.0
* Unicode 10.0.0
00A0>0020
00A8>0020 0308

View File

@ -1,7 +1,7 @@
# Unicode Character Database
# Copyright (c) 1991-2016 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 1999-2016, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc_cf.txt
#
@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 9.0.0
* Unicode 10.0.0
0041>0061
0042>0062

View File

@ -200,18 +200,18 @@ public final class ICUTokenizer extends Tokenizer {
*/
private boolean incrementTokenBuffer() {
int start = breaker.current();
if (start == BreakIterator.DONE)
return false; // BreakIterator exhausted
assert start != BreakIterator.DONE;
// find the next set of boundaries, skipping over non-tokens (rule status 0)
int end = breaker.next();
while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
while (end != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
start = end;
end = breaker.next();
}
if (start == BreakIterator.DONE)
if (end == BreakIterator.DONE) {
return false; // BreakIterator exhausted
}
termAtt.copyBuffer(buffer, start, end - start);
offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));

View File

@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 8.0). However, some users who wish
the most recent version of Unicode (currently 10.0). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.

View File

@ -62,9 +62,9 @@ import java.util.regex.Pattern;
*/
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-58-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
= "http://source.icu-project.org/repos/icu/tags";
private static final String ICU_RELEASE_TAG = "release-60-2";
private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
private static final String NFKC_CF_TXT = "nfkc_cf.txt";

View File

@ -31,7 +31,7 @@ com.fasterxml.jackson.core.version = 2.5.4
/com.googlecode.mp4parser/isoparser = 1.1.18
/com.healthmarketscience.jackcess/jackcess = 2.1.8
/com.healthmarketscience.jackcess/jackcess-encrypt = 2.1.4
/com.ibm.icu/icu4j = 59.1
/com.ibm.icu/icu4j = 60.2
/com.pff/java-libpst = 0.8.1
com.rometools.version = 1.5.1

View File

@ -1 +0,0 @@
6f06e820cf4c8968bbbaae66ae0b33f6a256b57f

View File

@ -0,0 +1 @@
e452cba3caaf93b997ff543c7246a6da74ed70f1

View File

@ -1 +0,0 @@
6f06e820cf4c8968bbbaae66ae0b33f6a256b57f

View File

@ -0,0 +1 @@
e452cba3caaf93b997ff543c7246a6da74ed70f1