mirror of https://github.com/apache/lucene.git
LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.
This commit is contained in:
parent
31437c9b43
commit
b0a43aa1b2
|
@ -127,6 +127,10 @@ Tests
|
||||||
expression to encapsulate a statement that is expected to throw an exception.
|
expression to encapsulate a statement that is expected to throw an exception.
|
||||||
(Ryan Ernst)
|
(Ryan Ernst)
|
||||||
|
|
||||||
|
Other
|
||||||
|
|
||||||
|
* LUCENE-7035: Upgrade icu4j to 56.1/unicode 8. (Robert Muir)
|
||||||
|
|
||||||
======================= Lucene 5.6.0 =======================
|
======================= Lucene 5.6.0 =======================
|
||||||
(No Changes)
|
(No Changes)
|
||||||
|
|
||||||
|
|
|
@ -1,61 +0,0 @@
|
||||||
#
|
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
# contributor license agreements. See the NOTICE file distributed with
|
|
||||||
# this work for additional information regarding copyright ownership.
|
|
||||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
# (the "License"); you may not use this file except in compliance with
|
|
||||||
# the License. You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
#
|
|
||||||
#
|
|
||||||
# Parses Khmer text, with orthographic syllable as token.
|
|
||||||
#
|
|
||||||
# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
|
|
||||||
#
|
|
||||||
# B = base character (consonant, independent vowel, etc)
|
|
||||||
$KhmerBase = [\u1780-\u17B3];
|
|
||||||
# R = robat
|
|
||||||
$KhmerRobat = [\u17CC];
|
|
||||||
# C = consonant shifter
|
|
||||||
$KhmerShifter = [\u17C9\u17CA];
|
|
||||||
# S = subscript consonant or independent vowel sign
|
|
||||||
$KhmerSub = ([\u17D2] $KhmerBase);
|
|
||||||
# V = dependent vowel sign
|
|
||||||
$KhmerVowel = [\u17B4-\u17C5];
|
|
||||||
# Z = zero-width joiner or non-joiner
|
|
||||||
$KhmerZWC = [\u200C\u200D];
|
|
||||||
# O = any other sign
|
|
||||||
$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
|
|
||||||
|
|
||||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
|
||||||
|
|
||||||
$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
|
|
||||||
|
|
||||||
$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
|
|
||||||
|
|
||||||
#
|
|
||||||
# default numerical definitions
|
|
||||||
#
|
|
||||||
$Extend = [\p{Word_Break = Extend}];
|
|
||||||
$Format = [\p{Word_Break = Format}];
|
|
||||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
|
||||||
$MidNum = [\p{Word_Break = MidNum}];
|
|
||||||
$Numeric = [\p{Word_Break = Numeric}];
|
|
||||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
|
||||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
|
||||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
|
||||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
|
||||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
|
||||||
|
|
||||||
!!forward;
|
|
||||||
$KhmerJoinedSyllableEx {200};
|
|
||||||
|
|
||||||
# default numeric rules
|
|
||||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
|
@ -62,7 +62,7 @@
|
||||||
07A6..07B0>
|
07A6..07B0>
|
||||||
07EB..07F5>
|
07EB..07F5>
|
||||||
0818..0819>
|
0818..0819>
|
||||||
08E4..08FE>
|
08E3..08FE>
|
||||||
093C>
|
093C>
|
||||||
094D>
|
094D>
|
||||||
0951..0954>
|
0951..0954>
|
||||||
|
@ -149,7 +149,7 @@ AAF6>
|
||||||
AB5B..AB5F>
|
AB5B..AB5F>
|
||||||
ABEC..ABED>
|
ABEC..ABED>
|
||||||
FB1E>
|
FB1E>
|
||||||
FE20..FE2D>
|
FE20..FE2F>
|
||||||
FF3E>
|
FF3E>
|
||||||
FF40>
|
FF40>
|
||||||
FF70>
|
FF70>
|
||||||
|
@ -161,6 +161,7 @@ FFE3>
|
||||||
11133..11134>
|
11133..11134>
|
||||||
11173>
|
11173>
|
||||||
111C0>
|
111C0>
|
||||||
|
111CA..111CC>
|
||||||
11235..11236>
|
11235..11236>
|
||||||
112E9..112EA>
|
112E9..112EA>
|
||||||
1133C>
|
1133C>
|
||||||
|
@ -171,6 +172,7 @@ FFE3>
|
||||||
115BF..115C0>
|
115BF..115C0>
|
||||||
1163F>
|
1163F>
|
||||||
116B6..116B7>
|
116B6..116B7>
|
||||||
|
1172B>
|
||||||
16AF0..16AF4>
|
16AF0..16AF4>
|
||||||
16F8F..16F9F>
|
16F8F..16F9F>
|
||||||
1D167..1D169>
|
1D167..1D169>
|
||||||
|
|
|
@ -540,6 +540,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
||||||
116C7>0037 # TAKRI DIGIT SEVEN
|
116C7>0037 # TAKRI DIGIT SEVEN
|
||||||
116C8>0038 # TAKRI DIGIT EIGHT
|
116C8>0038 # TAKRI DIGIT EIGHT
|
||||||
116C9>0039 # TAKRI DIGIT NINE
|
116C9>0039 # TAKRI DIGIT NINE
|
||||||
|
11730>0030 # AHOM DIGIT ZERO
|
||||||
|
11731>0031 # AHOM DIGIT ONE
|
||||||
|
11732>0032 # AHOM DIGIT TWO
|
||||||
|
11733>0033 # AHOM DIGIT THREE
|
||||||
|
11734>0034 # AHOM DIGIT FOUR
|
||||||
|
11735>0035 # AHOM DIGIT FIVE
|
||||||
|
11736>0036 # AHOM DIGIT SIX
|
||||||
|
11737>0037 # AHOM DIGIT SEVEN
|
||||||
|
11738>0038 # AHOM DIGIT EIGHT
|
||||||
|
11739>0039 # AHOM DIGIT NINE
|
||||||
118E0>0030 # WARANG CITI DIGIT ZERO
|
118E0>0030 # WARANG CITI DIGIT ZERO
|
||||||
118E1>0031 # WARANG CITI DIGIT ONE
|
118E1>0031 # WARANG CITI DIGIT ONE
|
||||||
118E2>0032 # WARANG CITI DIGIT TWO
|
118E2>0032 # WARANG CITI DIGIT TWO
|
||||||
|
|
|
@ -35,9 +35,7 @@ import com.ibm.icu.util.ULocale;
|
||||||
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
||||||
* but with the following tailorings:
|
* but with the following tailorings:
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary.
|
* <li>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary.
|
||||||
* <li>Khmer text is broken into syllables
|
|
||||||
* based on custom BreakIterator rules.
|
|
||||||
* </ul>
|
* </ul>
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
|
@ -65,8 +63,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
||||||
// the same as ROOT, except no dictionary segmentation for cjk
|
// the same as ROOT, except no dictionary segmentation for cjk
|
||||||
private static final BreakIterator defaultBreakIterator =
|
private static final BreakIterator defaultBreakIterator =
|
||||||
readBreakIterator("Default.brk");
|
readBreakIterator("Default.brk");
|
||||||
private static final BreakIterator khmerBreakIterator =
|
|
||||||
readBreakIterator("Khmer.brk");
|
|
||||||
|
|
||||||
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
||||||
private final boolean cjkAsWords;
|
private final boolean cjkAsWords;
|
||||||
|
@ -91,7 +87,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
||||||
@Override
|
@Override
|
||||||
public BreakIterator getBreakIterator(int script) {
|
public BreakIterator getBreakIterator(int script) {
|
||||||
switch(script) {
|
switch(script) {
|
||||||
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
|
|
||||||
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
||||||
default: return (BreakIterator)defaultBreakIterator.clone();
|
default: return (BreakIterator)defaultBreakIterator.clone();
|
||||||
}
|
}
|
||||||
|
|
|
@ -353,7 +353,7 @@ and
|
||||||
<h1><a name="backcompat">Backwards Compatibility</a></h1>
|
<h1><a name="backcompat">Backwards Compatibility</a></h1>
|
||||||
<p>
|
<p>
|
||||||
This module exists to provide up-to-date Unicode functionality that supports
|
This module exists to provide up-to-date Unicode functionality that supports
|
||||||
the most recent version of Unicode (currently 6.3). However, some users who wish
|
the most recent version of Unicode (currently 8.0). However, some users who wish
|
||||||
for stronger backwards compatibility can restrict
|
for stronger backwards compatibility can restrict
|
||||||
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
|
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
|
||||||
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
|
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -129,6 +129,9 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
||||||
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
|
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testKhmer() throws Exception {
|
||||||
|
assertAnalyzesTo(a, "ផ្ទះស្កឹមស្កៃបីបួនខ្នងនេះ", new String[] { "ផ្ទះ", "ស្កឹមស្កៃ", "បី", "បួន", "ខ្នង", "នេះ" });
|
||||||
|
}
|
||||||
public void testLao() throws Exception {
|
public void testLao() throws Exception {
|
||||||
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
|
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
|
||||||
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
|
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
|
||||||
|
|
|
@ -39,7 +39,7 @@ com.google.inject.guice.version = 3.0
|
||||||
/com.google.protobuf/protobuf-java = 2.5.0
|
/com.google.protobuf/protobuf-java = 2.5.0
|
||||||
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
|
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
|
||||||
/com.googlecode.mp4parser/isoparser = 1.0.2
|
/com.googlecode.mp4parser/isoparser = 1.0.2
|
||||||
/com.ibm.icu/icu4j = 54.1
|
/com.ibm.icu/icu4j = 56.1
|
||||||
/com.pff/java-libpst = 0.8.1
|
/com.pff/java-libpst = 0.8.1
|
||||||
/com.spatial4j/spatial4j = 0.5
|
/com.spatial4j/spatial4j = 0.5
|
||||||
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
3f66ecd5871467598bc81662817b80612a0a907f
|
|
|
@ -0,0 +1 @@
|
||||||
|
8dd6671f52165a0419e6de5e1016400875a90fa9
|
|
@ -1 +0,0 @@
|
||||||
3f66ecd5871467598bc81662817b80612a0a907f
|
|
|
@ -0,0 +1 @@
|
||||||
|
8dd6671f52165a0419e6de5e1016400875a90fa9
|
Loading…
Reference in New Issue