LUCENE-7035: Upgrade icu4j to 56.1/unicode 8.

2016-02-17 20:01:27 -05:00 · 2016-02-17 20:01:27 -05:00 · b0a43aa1b2
parent 31437c9b43
commit b0a43aa1b2
15 changed files with 26 additions and 73 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -127,6 +127,10 @@ Tests
  expression to encapsulate a statement that is expected to throw an exception.
  (Ryan Ernst)
 Other
 * LUCENE-7035: Upgrade icu4j to 56.1/unicode 8. (Robert Muir)
 ======================= Lucene 5.6.0 =======================
 (No Changes)
--- a/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
+++ b/lucene/analysis/icu/src/data/uax29/Khmer.rbbi
@ -1,61 +0,0 @@
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # 
 # Parses Khmer text, with orthographic syllable as token.
 #
 # The definition of Khmer orthographic syllable is taken from the Unicode Standard.
 #
 # B = base character (consonant, independent vowel, etc)
 $KhmerBase = [\u1780-\u17B3];
 # R = robat
 $KhmerRobat = [\u17CC];
 # C = consonant shifter
 $KhmerShifter = [\u17C9\u17CA];
 # S = subscript consonant or independent vowel sign
 $KhmerSub = ([\u17D2] $KhmerBase);
 # V = dependent vowel sign
 $KhmerVowel = [\u17B4-\u17C5];
 # Z = zero-width joiner or non-joiner
 $KhmerZWC = [\u200C\u200D];
 # O = any other sign
 $KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; 
 $WordJoin = [:Line_Break=Word_Joiner:];
 $KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
 $KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
 #
 # default numerical definitions
 #
 $Extend       = [\p{Word_Break = Extend}];
 $Format       = [\p{Word_Break = Format}];
 $MidNumLet    = [\p{Word_Break = MidNumLet}];
 $MidNum       = [\p{Word_Break = MidNum}];
 $Numeric      = [\p{Word_Break = Numeric}];
 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                                                          
 $MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
 $MidNumEx       = $MidNum       ($Extend |  $Format)*;
 $NumericEx      = $Numeric      ($Extend |  $Format)*;
 $ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
 !!forward;
 $KhmerJoinedSyllableEx {200};
 # default numeric rules
 $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)*  {100};
--- a/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/DiacriticFolding.txt
@ -62,7 +62,7 @@
 07A6..07B0>
 07EB..07F5>
 0818..0819>
-08E4..08FE>
+08E3..08FE>
 093C>
 094D>
 0951..0954>
@ -149,7 +149,7 @@ AAF6>
 AB5B..AB5F>
 ABEC..ABED>
 FB1E>
-FE20..FE2D>
+FE20..FE2F>
 FF3E>
 FF40>
 FF70>
@ -161,6 +161,7 @@ FFE3>
 11133..11134>
 11173>
 111C0>
 111CA..111CC>
 11235..11236>
 112E9..112EA>
 1133C>
@ -171,6 +172,7 @@ FFE3>
 115BF..115C0>
 1163F>
 116B6..116B7>
 1172B>
 16AF0..16AF4>
 16F8F..16F9F>
 1D167..1D169>
--- a/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
+++ b/lucene/analysis/icu/src/data/utr30/NativeDigitFolding.txt
@ -540,6 +540,16 @@ ABF9>0039   # MEETEI MAYEK DIGIT NINE
 116C7>0037   # TAKRI DIGIT SEVEN
 116C8>0038   # TAKRI DIGIT EIGHT
 116C9>0039   # TAKRI DIGIT NINE
 11730>0030   # AHOM DIGIT ZERO
 11731>0031   # AHOM DIGIT ONE
 11732>0032   # AHOM DIGIT TWO
 11733>0033   # AHOM DIGIT THREE
 11734>0034   # AHOM DIGIT FOUR
 11735>0035   # AHOM DIGIT FIVE
 11736>0036   # AHOM DIGIT SIX
 11737>0037   # AHOM DIGIT SEVEN
 11738>0038   # AHOM DIGIT EIGHT
 11739>0039   # AHOM DIGIT NINE
 118E0>0030   # WARANG CITI DIGIT ZERO
 118E1>0031   # WARANG CITI DIGIT ONE
 118E2>0032   # WARANG CITI DIGIT TWO
--- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
+++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
@ -35,9 +35,7 @@ import com.ibm.icu.util.ULocale;
 * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), 
 * but with the following tailorings:
 * <ul>
- *   <li>Thai, Lao, Myanmar, and CJK text is broken into words with a dictionary. 
+ *   <li>Thai, Lao, Myanmar, Khmer, and CJK text is broken into words with a dictionary. 
 *   <li>Khmer text is broken into syllables
 *   based on custom BreakIterator rules.
 * </ul>
 * @lucene.experimental
 */
@ -65,8 +63,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
  // the same as ROOT, except no dictionary segmentation for cjk
  private static final BreakIterator defaultBreakIterator = 
    readBreakIterator("Default.brk");
  private static final BreakIterator khmerBreakIterator = 
    readBreakIterator("Khmer.brk");
  // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
  private final boolean cjkAsWords;
@ -91,7 +87,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
  @Override
  public BreakIterator getBreakIterator(int script) {
    switch(script) {
      case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
      default: return (BreakIterator)defaultBreakIterator.clone();
    }
--- a/lucene/analysis/icu/src/java/overview.html
+++ b/lucene/analysis/icu/src/java/overview.html
@ -353,7 +353,7 @@ and
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
 This module exists to provide up-to-date Unicode functionality that supports
-the most recent version of Unicode (currently 6.3). However, some users who wish
+the most recent version of Unicode (currently 8.0). However, some users who wish
 for stronger backwards compatibility can restrict
 {@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
 a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}. 
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Default.brk
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk
--- a/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
+++ b/lucene/analysis/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
@ -129,6 +129,9 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
        "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
  }
  public void testKhmer() throws Exception {
    assertAnalyzesTo(a, "ផ្ទះស្កឹមស្កៃបីបួនខ្នងនេះ", new String[] { "ផ្ទះ", "ស្កឹមស្កៃ", "បី", "បួន", "ខ្នង", "នេះ" });
  }
  public void testLao() throws Exception {
    assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
    assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
--- a/lucene/ivy-versions.properties
+++ b/lucene/ivy-versions.properties
@ -39,7 +39,7 @@ com.google.inject.guice.version = 3.0
 /com.google.protobuf/protobuf-java = 2.5.0
 /com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
 /com.googlecode.mp4parser/isoparser = 1.0.2
-/com.ibm.icu/icu4j = 54.1
+/com.ibm.icu/icu4j = 56.1
 /com.pff/java-libpst = 0.8.1
 /com.spatial4j/spatial4j = 0.5
--- a/lucene/licenses/icu4j-54.1.jar.sha1
+++ b/lucene/licenses/icu4j-54.1.jar.sha1
@ -1 +0,0 @@
 3f66ecd5871467598bc81662817b80612a0a907f
--- a/lucene/licenses/icu4j-56.1.jar.sha1
+++ b/lucene/licenses/icu4j-56.1.jar.sha1
@ -0,0 +1 @@
 8dd6671f52165a0419e6de5e1016400875a90fa9
--- a/solr/licenses/icu4j-54.1.jar.sha1
+++ b/solr/licenses/icu4j-54.1.jar.sha1
@ -1 +0,0 @@
 3f66ecd5871467598bc81662817b80612a0a907f
--- a/solr/licenses/icu4j-56.1.jar.sha1
+++ b/solr/licenses/icu4j-56.1.jar.sha1
@ -0,0 +1 @@
 8dd6671f52165a0419e6de5e1016400875a90fa9