mirror of https://github.com/apache/lucene.git
LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.
This commit is contained in:
parent
31437c9b43
commit
b0a43aa1b2
|
@ -127,6 +127,10 @@ Tests
|
|||
expression to encapsulate a statement that is expected to throw an exception.
|
||||
(Ryan Ernst)
|
||||
|
||||
Other
|
||||
|
||||
* LUCENE-7035: Upgrade icu4j to 56.1/unicode 8. (Robert Muir)
|
||||
|
||||
======================= Lucene 5.6.0 =======================
|
||||
(No Changes)
|
||||
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#
|
||||
# Parses Khmer text, with orthographic syllable as token.
|
||||
#
|
||||
# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
|
||||
#
|
||||
# B = base character (consonant, independent vowel, etc)
|
||||
$KhmerBase = [\u1780-\u17B3];
|
||||
# R = robat
|
||||
$KhmerRobat = [\u17CC];
|
||||
# C = consonant shifter
|
||||
$KhmerShifter = [\u17C9\u17CA];
|
||||
# S = subscript consonant or independent vowel sign
|
||||
$KhmerSub = ([\u17D2] $KhmerBase);
|
||||
# V = dependent vowel sign
|
||||
$KhmerVowel = [\u17B4-\u17C5];
|
||||
# Z = zero-width joiner or non-joiner
|
||||
$KhmerZWC = [\u200C\u200D];
|
||||
# O = any other sign
|
||||
$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
|
||||
|
||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
||||
|
||||
$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
|
||||
|
||||
$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
|
||||
|
||||
#
|
||||
# default numerical definitions
|
||||
#
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
!!forward;
|
||||
$KhmerJoinedSyllableEx {200};
|
||||
|
||||
# default numeric rules
|
||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
@ -62,7 +62,7 @@
|
|||
07A6..07B0>
|
||||
07EB..07F5>
|
||||
0818..0819>
|
||||
08E4..08FE>
|
||||
08E3..08FE>
|
||||
093C>
|
||||
094D>
|
||||
0951..0954>
|
||||
|
@ -149,7 +149,7 @@ AAF6>
|
|||
AB5B..AB5F>
|
||||
ABEC..ABED>
|
||||
FB1E>
|
||||
FE20..FE2D>
|
||||
FE20..FE2F>
|
||||
FF3E>
|
||||
FF40>
|
||||
FF70>
|
||||
|
@ -161,6 +161,7 @@ FFE3>
|
|||
11133..11134>
|
||||
11173>
|
||||
111C0>
|
||||
111CA..111CC>
|
||||
11235..11236>
|
||||
112E9..112EA>
|
||||
1133C>
|
||||
|
@ -171,6 +172,7 @@ FFE3>
|
|||
115BF..115C0>
|
||||
1163F>
|
||||
116B6..116B7>
|
||||
1172B>
|
||||
16AF0..16AF4>
|
||||
16F8F..16F9F>
|
||||
1D167..1D169>
|
||||
|
|
|
@ -540,6 +540,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
|||
116C7>0037 # TAKRI DIGIT SEVEN
|
||||
116C8>0038 # TAKRI DIGIT EIGHT
|
||||
116C9>0039 # TAKRI DIGIT NINE
|
||||
11730>0030 # AHOM DIGIT ZERO
|
||||
11731>0031 # AHOM DIGIT ONE
|
||||
11732>0032 # AHOM DIGIT TWO
|
||||
11733>0033 # AHOM DIGIT THREE
|
||||
11734>0034 # AHOM DIGIT FOUR
|
||||
11735>0035 # AHOM DIGIT FIVE
|
||||
11736>0036 # AHOM DIGIT SIX
|
||||
11737>0037 # AHOM DIGIT SEVEN
|
||||
11738>0038 # AHOM DIGIT EIGHT
|
||||
11739>0039 # AHOM DIGIT NINE
|
||||
118E0>0030 # WARANG CITI DIGIT ZERO
|
||||
118E1>0031 # WARANG CITI DIGIT ONE
|
||||
118E2>0032 # WARANG CITI DIGIT TWO
|
||||
|
|
|
@ -35,9 +35,7 @@ import com.ibm.icu.util.ULocale;
|
|||
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
||||
* but with the following tailorings:
|
||||
* <ul>
|
||||
* <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary.
|
||||
* <li>Khmer text is broken into syllables
|
||||
* based on custom BreakIterator rules.
|
||||
* <li>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary.
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -65,8 +63,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
// the same as ROOT, except no dictionary segmentation for cjk
|
||||
private static final BreakIterator defaultBreakIterator =
|
||||
readBreakIterator("Default.brk");
|
||||
private static final BreakIterator khmerBreakIterator =
|
||||
readBreakIterator("Khmer.brk");
|
||||
|
||||
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
||||
private final boolean cjkAsWords;
|
||||
|
@ -91,7 +87,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
@Override
|
||||
public BreakIterator getBreakIterator(int script) {
|
||||
switch(script) {
|
||||
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
|
||||
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
||||
default: return (BreakIterator)defaultBreakIterator.clone();
|
||||
}
|
||||
|
|
|
@ -353,7 +353,7 @@ and
|
|||
<h1><a name="backcompat">Backwards Compatibility</a></h1>
|
||||
<p>
|
||||
This module exists to provide up-to-date Unicode functionality that supports
|
||||
the most recent version of Unicode (currently 6.3). However, some users who wish
|
||||
the most recent version of Unicode (currently 8.0). However, some users who wish
|
||||
for stronger backwards compatibility can restrict
|
||||
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
|
||||
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -129,6 +129,9 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
|
||||
}
|
||||
|
||||
public void testKhmer() throws Exception {
|
||||
assertAnalyzesTo(a, "ផ្ទះស្កឹមស្កៃបីបួនខ្នងនេះ", new String[] { "ផ្ទះ", "ស្កឹមស្កៃ", "បី", "បួន", "ខ្នង", "នេះ" });
|
||||
}
|
||||
public void testLao() throws Exception {
|
||||
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
|
||||
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
|
||||
|
|
|
@ -39,7 +39,7 @@ com.google.inject.guice.version = 3.0
|
|||
/com.google.protobuf/protobuf-java = 2.5.0
|
||||
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
|
||||
/com.googlecode.mp4parser/isoparser = 1.0.2
|
||||
/com.ibm.icu/icu4j = 54.1
|
||||
/com.ibm.icu/icu4j = 56.1
|
||||
/com.pff/java-libpst = 0.8.1
|
||||
/com.spatial4j/spatial4j = 0.5
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
3f66ecd5871467598bc81662817b80612a0a907f
|
|
@ -0,0 +1 @@
|
|||
8dd6671f52165a0419e6de5e1016400875a90fa9
|
|
@ -1 +0,0 @@
|
|||
3f66ecd5871467598bc81662817b80612a0a907f
|
|
@ -0,0 +1 @@
|
|||
8dd6671f52165a0419e6de5e1016400875a90fa9
|
Loading…
Reference in New Issue