LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.

This commit is contained in:
Robert Muir 2016-02-17 20:01:27 -05:00
parent 31437c9b43
commit b0a43aa1b2
15 changed files with 26 additions and 73 deletions

View File

@ -127,6 +127,10 @@ Tests
expression to encapsulate a statement that is expected to throw an exception. expression to encapsulate a statement that is expected to throw an exception.
(Ryan Ernst) (Ryan Ernst)
Other
* LUCENE-7035: Upgrade icu4j to 56.1/unicode 8. (Robert Muir)
======================= Lucene 5.6.0 ======================= ======================= Lucene 5.6.0 =======================
(No Changes) (No Changes)

View File

@ -1,61 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Parses Khmer text, with each orthographic syllable as a token.
#
# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
#
# B = base character (consonant, independent vowel, etc)
$KhmerBase = [\u1780-\u17B3];
# R = robat
$KhmerRobat = [\u17CC];
# C = consonant shifter
$KhmerShifter = [\u17C9\u17CA];
# S = subscript consonant or independent vowel sign
$KhmerSub = ([\u17D2] $KhmerBase);
# V = dependent vowel sign
$KhmerVowel = [\u17B4-\u17C5];
# Z = zero-width joiner or non-joiner
$KhmerZWC = [\u200C\u200D];
# O = any other sign
$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
$WordJoin = [:Line_Break=Word_Joiner:];
$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$KhmerJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -62,7 +62,7 @@
07A6..07B0> 07A6..07B0>
07EB..07F5> 07EB..07F5>
0818..0819> 0818..0819>
08E4..08FE> 08E3..08FE>
093C> 093C>
094D> 094D>
0951..0954> 0951..0954>
@ -149,7 +149,7 @@ AAF6>
AB5B..AB5F> AB5B..AB5F>
ABEC..ABED> ABEC..ABED>
FB1E> FB1E>
FE20..FE2D> FE20..FE2F>
FF3E> FF3E>
FF40> FF40>
FF70> FF70>
@ -161,6 +161,7 @@ FFE3>
11133..11134> 11133..11134>
11173> 11173>
111C0> 111C0>
111CA..111CC>
11235..11236> 11235..11236>
112E9..112EA> 112E9..112EA>
1133C> 1133C>
@ -171,6 +172,7 @@ FFE3>
115BF..115C0> 115BF..115C0>
1163F> 1163F>
116B6..116B7> 116B6..116B7>
1172B>
16AF0..16AF4> 16AF0..16AF4>
16F8F..16F9F> 16F8F..16F9F>
1D167..1D169> 1D167..1D169>

View File

@ -540,6 +540,16 @@ ABF9>0039 # MEETEI MAYEK DIGIT NINE
116C7>0037 # TAKRI DIGIT SEVEN 116C7>0037 # TAKRI DIGIT SEVEN
116C8>0038 # TAKRI DIGIT EIGHT 116C8>0038 # TAKRI DIGIT EIGHT
116C9>0039 # TAKRI DIGIT NINE 116C9>0039 # TAKRI DIGIT NINE
11730>0030 # AHOM DIGIT ZERO
11731>0031 # AHOM DIGIT ONE
11732>0032 # AHOM DIGIT TWO
11733>0033 # AHOM DIGIT THREE
11734>0034 # AHOM DIGIT FOUR
11735>0035 # AHOM DIGIT FIVE
11736>0036 # AHOM DIGIT SIX
11737>0037 # AHOM DIGIT SEVEN
11738>0038 # AHOM DIGIT EIGHT
11739>0039 # AHOM DIGIT NINE
118E0>0030 # WARANG CITI DIGIT ZERO 118E0>0030 # WARANG CITI DIGIT ZERO
118E1>0031 # WARANG CITI DIGIT ONE 118E1>0031 # WARANG CITI DIGIT ONE
118E2>0032 # WARANG CITI DIGIT TWO 118E2>0032 # WARANG CITI DIGIT TWO

View File

@ -35,9 +35,7 @@ import com.ibm.icu.util.ULocale;
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings: * but with the following tailorings:
* <ul> * <ul>
* <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary. * <li>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary.
* <li>Khmer text is broken into syllables
* based on custom BreakIterator rules.
* </ul> * </ul>
* @lucene.experimental * @lucene.experimental
*/ */
@ -65,8 +63,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
// the same as ROOT, except no dictionary segmentation for cjk // the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator = private static final BreakIterator defaultBreakIterator =
readBreakIterator("Default.brk"); readBreakIterator("Default.brk");
private static final BreakIterator khmerBreakIterator =
readBreakIterator("Khmer.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff... // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
private final boolean cjkAsWords; private final boolean cjkAsWords;
@ -91,7 +87,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
@Override @Override
public BreakIterator getBreakIterator(int script) { public BreakIterator getBreakIterator(int script) {
switch(script) { switch(script) {
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone(); case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
default: return (BreakIterator)defaultBreakIterator.clone(); default: return (BreakIterator)defaultBreakIterator.clone();
} }

View File

@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1> <h1><a name="backcompat">Backwards Compatibility</a></h1>
<p> <p>
This module exists to provide up-to-date Unicode functionality that supports This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 6.3). However, some users who wish the most recent version of Unicode (currently 8.0). However, some users who wish
for stronger backwards compatibility can restrict for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.

View File

@ -129,6 +129,9 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" }); "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
} }
public void testKhmer() throws Exception {
assertAnalyzesTo(a, "ផ្ទះស្កឹមស្កៃបីបួនខ្នងនេះ", new String[] { "ផ្ទះ", "ស្កឹមស្កៃ", "បី", "បួន", "ខ្នង", "នេះ" });
}
public void testLao() throws Exception { public void testLao() throws Exception {
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" }); assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" }); assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });

View File

@ -39,7 +39,7 @@ com.google.inject.guice.version = 3.0
/com.google.protobuf/protobuf-java = 2.5.0 /com.google.protobuf/protobuf-java = 2.5.0
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3 /com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.0.2 /com.googlecode.mp4parser/isoparser = 1.0.2
/com.ibm.icu/icu4j = 54.1 /com.ibm.icu/icu4j = 56.1
/com.pff/java-libpst = 0.8.1 /com.pff/java-libpst = 0.8.1
/com.spatial4j/spatial4j = 0.5 /com.spatial4j/spatial4j = 0.5

View File

@ -1 +0,0 @@
3f66ecd5871467598bc81662817b80612a0a907f

View File

@ -0,0 +1 @@
8dd6671f52165a0419e6de5e1016400875a90fa9

View File

@ -1 +0,0 @@
3f66ecd5871467598bc81662817b80612a0a907f

View File

@ -0,0 +1 @@
8dd6671f52165a0419e6de5e1016400875a90fa9