LUCENE-4381: upgrade ICU to icu4j 52.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1547502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-12-03 18:05:23 +00:00
parent 5f5098299a
commit b5dbac5e35
37 changed files with 281 additions and 667 deletions

View File

@ -89,6 +89,8 @@ Build
* LUCENE-5347: Upgrade forbidden-apis checker to version 1.4.
(Uwe Schindler)
* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)
Bug fixes
* LUCENE-5285: Improved highlighting of multi-valued fields with

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros

View File

@ -14,27 +14,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default RBBI rules, based on UAX#29.
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################
!!chain;
#
# Character Class Definitions.
#
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
# Dictionary character set, for triggering language-based break engines. Currently
@ -42,24 +67,34 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
# include the dictionary characters.
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
$dictionary = [$ComplexContext];
# leave CJK scripts out of ALetterPlus
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
$KatakanaEx = $Katakana ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
$Hiragana = [\p{script=Hiragana}];
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
@ -77,23 +112,31 @@ $CR $LF;
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s).
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$Hebrew_LetterEx{200};
$KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtual of being numerically larger.
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
#
# rule 5
# Do not break between most letters.
#
$ALetterEx $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 7a
$Hebrew_LetterEx $Single_QuoteEx {200};
# rule 7b and 7c
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
# rule 8
@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};
# rule 9
$ALetterEx $NumericEx {200};
($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
# rule 10
$NumericEx $ALetterEx {200};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
# rule 13
$KatakanaEx $KatakanaEx {300};
# rule 13a/b
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
# rule 13c
$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};

View File

@ -1,61 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This is an example of rule tailoring for Hebrew.
# In this example the single-quote is added to the Extend category
# The double-quote is added to the MidLetter category.
#
!!chain;
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}\u0027];
$Format = [\p{Word_Break = Format}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}\u0022];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$CR $LF;
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$ALetterEx $ALetterEx {200};
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
$NumericEx $NumericEx {100};
$ALetterEx $NumericEx {200};
$NumericEx $ALetterEx {200};
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$ALetterEx $ExtendNumLetEx {200};
$NumericEx $ExtendNumLetEx {100};
$ExtendNumLetEx $ExtendNumLetEx {200};
$ExtendNumLetEx $ALetterEx {200};
$ExtendNumLetEx $NumericEx {100};

View File

@ -1,192 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Parses Lao text, with syllable as token.
#
# The definition of Lao syllable is based from:
#
# Syllabification of Lao Script for Line Breaking
# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
#
# NOTE:
# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
#
# Syllable structure, where X is the nuclear consonant:
#
# +----+
# | X5 |
# +----+
# | X4 |
# +----+----+----+----+----+----+----+-----+
# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
# +----+----+----+----+----+----+----+-----+
# | X2 |
# +----+
# | X3 |
# +----+
#
# X0 represents a vowel which occurs before the nuclear consonant.
# It can always define the beginning of syllable.
$X0 = [\u0EC0-\u0EC4];
# X1 is a combination consonant which comes before the nuclear consonant,
# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
$X1 = [\u0EAB];
# X represents the nuclear consonant.
$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
# X2 is a combination consonant which comes after the nuclear consonant,
# which is placed under or next to the nuclear consonant.
$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
# X3 represents a vowel which occurs under the nuclear consonant.
$X3 = [\u0EB8\u0EB9];
# X4 represents a vowel which occurs above the nuclear consonant.
$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
$X5 = [\u0EC8-\u0ECB];
# X6 represents a consonant vowel, which occurs after the nuclear consonant.
# It functions when the syllable doesn't have any vowels. And it always exists with X8.
$X6 = [\u0EA7\u0EAD\u0EBD];
# X7 represents a final vowel.
# However X7_1 always represents the end of syllable and it never exists with tone mark.
$X7 = [\u0EB0\u0EB2\u0EB3];
# X8 represents an alternate consonant.
$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
# X9 represents alternate consonants to pronounce foreign terms; they always exist with X10_3.
$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
# X10 represents a sign mark.
# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
$X10 = [\u0EAF\u0EC6\u0ECC];
# Section 1
$X0_1 = [\u0EC0];
$X4_1_2 = [\u0EB4\u0EB5];
$X4_3_4 = [\u0EB6\u0EB7];
$X4_6 = [\u0EBB];
$X4_7 = [\u0EB1];
$X6_2 = [\u0EAD];
$X6_3 = [\u0EBD];
$X7_1 = [\u0EB0];
$X7_2 = [\u0EB2];
$X10_1 = [\u0EAF];
$X10_2 = [\u0EC6];
$X10_3 = [\u0ECC];
$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
# Section 2
$X0_2 = [\u0EC1];
$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
# Section 3
$X0_3 = [\u0EC2];
$X8_3 = [\u0E8D];
$X8_8 = [\u0EA7];
$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
# Section 4
$X0_4 = [\u0EC4];
$X6_1 = [\u0EA7];
$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 5
$X0_5 = [\u0EC3];
$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 6
$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 7
$X4_1_4 = [\u0EB4-\u0EB7];
$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 8
$X4_5 = [\u0ECD];
$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 9
$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
$Rule9 = ($Rule9_1 | $Rule9_2);
# Section 10
$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 11
$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 12
$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
# Section 13
$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 14
$X7_3 = [\u0EB3];
$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
$WordJoin = [:Line_Break=Word_Joiner:];
$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$LaoJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -78,7 +78,6 @@ FF0D>002D
## Space Folding
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
1680>0020
180E>0020
## Spacing Accents folding (done by kd)

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfc.txt
@ -7,7 +7,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 6.1.0
* Unicode 6.3.0
# Canonical_Combining_Class (ccc) values
0300..0314:230

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
@ -11,7 +11,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 6.1.0
* Unicode 6.3.0
00A0>0020
00A8>0020 0308

View File

@ -1,5 +1,5 @@
# Unicode Character Database
# Copyright (c) 1991-2012 Unicode, Inc.
# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 6.1.0
* Unicode 6.3.0
0041>0061
0042>0062
@ -537,6 +537,7 @@
0555>0585
0556>0586
0587>0565 0582
061C>
0675>0627 0674
0676>0648 0674
0677>06C7 0674
@ -627,7 +628,7 @@
10FC>10DC
115F..1160>
17B4..17B5>
180B..180D>
180B..180E>
1D2C>0061
1D2D>00E6
1D2E>0062

View File

@ -21,7 +21,6 @@ import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
@ -60,15 +59,12 @@ abstract class BreakIteratorWrapper {
}
/**
* If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
* treat it like a generic BreakIterator If its any other
* RuleBasedBreakIterator, the rule status can be used for token type. If its
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
* any other BreakIterator, the rule status method is not available, so treat
* it like a generic BreakIterator.
*/
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
if (breakIterator instanceof RuleBasedBreakIterator
&& !(breakIterator instanceof DictionaryBasedBreakIterator))
if (breakIterator instanceof RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
else
return new BIWrapper(breakIterator);

View File

@ -41,12 +41,13 @@ final class CompositeBreakIterator {
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
private BreakIteratorWrapper rbbi;
private final ScriptIterator scriptIterator = new ScriptIterator();
private final ScriptIterator scriptIterator;
private char text[];
CompositeBreakIterator(ICUTokenizerConfig config) {
this.config = config;
this.scriptIterator = new ScriptIterator(config.combineCJ());
}
/**

View File

@ -35,12 +35,9 @@ import com.ibm.icu.util.ULocale;
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings:
* <ul>
* <li>Thai text is broken into words with a
* {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
* <li>Lao, Myanmar, and Khmer text is broken into syllables
* <li>Thai, Lao, and CJK text is broken into words with a dictionary.
* <li>Myanmar and Khmer text is broken into syllables
* based on custom BreakIterator rules.
* <li>Hebrew text has custom tailorings to handle special cases
* involving punctuation.
* </ul>
* @lucene.experimental
*/
@ -62,34 +59,44 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
* the default breakiterators in use. these can be expensive to
* instantiate, cheap to clone.
*/
private static final BreakIterator rootBreakIterator =
// we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
// the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator =
readBreakIterator("Default.brk");
private static final BreakIterator thaiBreakIterator =
BreakIterator.getWordInstance(new ULocale("th_TH"));
private static final BreakIterator hebrewBreakIterator =
readBreakIterator("Hebrew.brk");
private static final BreakIterator khmerBreakIterator =
readBreakIterator("Khmer.brk");
private static final BreakIterator laoBreakIterator =
new LaoBreakIterator(readBreakIterator("Lao.brk"));
private static final BreakIterator myanmarBreakIterator =
readBreakIterator("Myanmar.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
private final boolean cjkAsWords;
/**
* Creates a new config. This object is lightweight, but the first
* time the class is referenced, breakiterators will be initialized.
* @param cjkAsWords true if cjk text should undergo dictionary-based segmentation,
* otherwise text will be segmented according to UAX#29 defaults.
* If this is true, all Han+Hiragana+Katakana words will be tagged as
* IDEOGRAPHIC.
*/
public DefaultICUTokenizerConfig() {}
public DefaultICUTokenizerConfig(boolean cjkAsWords) {
this.cjkAsWords = cjkAsWords;
}
@Override
public boolean combineCJ() {
return cjkAsWords;
}
@Override
public BreakIterator getBreakIterator(int script) {
switch(script) {
case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
default: return (BreakIterator)rootBreakIterator.clone();
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
default: return (BreakIterator)defaultBreakIterator.clone();
}
}

View File

@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer {
* @see DefaultICUTokenizerConfig
*/
public ICUTokenizer(Reader input) {
this(input, new DefaultICUTokenizerConfig());
this(input, new DefaultICUTokenizerConfig(true));
}
/**

View File

@ -36,4 +36,6 @@ public abstract class ICUTokenizerConfig {
/** Return a token type value for a given script and BreakIterator
* rule status. */
public abstract String getType(int script, int ruleStatus);
/** true if Han, Hiragana, and Katakana scripts should all be returned as Japanese */
public abstract boolean combineCJ();
}

View File

@ -70,7 +70,7 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
* <pre class="prettyprint" >
* &lt;fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.ICUTokenizerFactory"
* &lt;tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
* rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
static final String RULEFILES = "rulefiles";
private final Map<Integer,String> tailored;
private ICUTokenizerConfig config;
private final boolean cjkAsWords;
/** Creates a new ICUTokenizerFactory */
public ICUTokenizerFactory(Map<String,String> args) {
@ -94,6 +95,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
}
}
cjkAsWords = getBoolean(args, "cjkAsWords", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -103,7 +105,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
public void inform(ResourceLoader loader) throws IOException {
assert tailored != null : "init must be called first!";
if (tailored.isEmpty()) {
config = new DefaultICUTokenizerConfig();
config = new DefaultICUTokenizerConfig(cjkAsWords);
} else {
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
@ -111,7 +113,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
String resourcePath = entry.getValue();
breakers[code] = parseRules(resourcePath, loader);
}
config = new DefaultICUTokenizerConfig() {
config = new DefaultICUTokenizerConfig(cjkAsWords) {
@Override
public BreakIterator getBreakIterator(int script) {

View File

@ -1,230 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
/**
* Syllable iterator for Lao text.
* <p>
* This breaks Lao text into syllables according to:
* <i>Syllabification of Lao Script for Line Breaking</i>
* Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
* Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
* <ul>
* <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
* <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
* </ul>
* <p>
* Most work is accomplished with RBBI rules, however some additional special logic is needed
* that cannot be coded in a grammar, and this is implemented here.
* <p>
* For example, what appears to be a final consonant might instead be part of the next syllable.
* Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
* <p>
* Take for instance the text ກວ່າດອກ
* The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
* What LaoBreakIterator does, according to the paper:
* <ol>
* <li>backtrack and remove the from the last syllable, placing it on the current syllable.
* <li>verify the modified previous syllable (ກວ່າ ) is still legal.
* <li>verify the modified current syllable (ດອກ) is now legal.
* <li>If 2 or 3 fails, then restore the to the last syllable and skip the current character.
* </ol>
* <p>
* Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
* This is the issue of combining marks being in the wrong order (typos).
* @lucene.experimental
*/
public class LaoBreakIterator extends BreakIterator {
RuleBasedBreakIterator rules;
CharArrayIterator text;
CharArrayIterator working = new CharArrayIterator();
int workingOffset = 0;
CharArrayIterator verifyText = new CharArrayIterator();
RuleBasedBreakIterator verify;
private static final UnicodeSet laoSet;
static {
laoSet = new UnicodeSet("[:Lao:]");
laoSet.compact();
laoSet.freeze();
}
/**
* Creates a new iterator, performing the backtracking verification
* across the provided <code>rules</code>.
*/
public LaoBreakIterator(RuleBasedBreakIterator rules) {
this.rules = (RuleBasedBreakIterator) rules.clone();
this.verify = (RuleBasedBreakIterator) rules.clone();
}
@Override
public int current() {
int current = rules.current();
return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
}
@Override
public int first() {
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
rules.setText(working);
workingOffset = 0;
int first = rules.first();
return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
}
@Override
public int following(int offset) {
throw new UnsupportedOperationException();
}
@Override
public CharacterIterator getText() {
return text;
}
@Override
public int last() {
throw new UnsupportedOperationException();
}
@Override
public int next() {
int current = current();
int next = rules.next();
if (next == BreakIterator.DONE)
return next;
else
next += workingOffset;
char c = working.current();
int following = rules.next(); // lookahead
if (following != BreakIterator.DONE) {
following += workingOffset;
if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
workingOffset = next - 1;
working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
return next - 1;
}
rules.previous(); // undo the lookahead
}
return next;
}
@Override
public int next(int n) {
if (n < 0)
throw new UnsupportedOperationException("Backwards traversal is unsupported");
int result = current();
while (n > 0) {
result = next();
--n;
}
return result;
}
@Override
public int previous() {
throw new UnsupportedOperationException("Backwards traversal is unsupported");
}
@Override
public void setText(CharacterIterator text) {
if (!(text instanceof CharArrayIterator))
throw new UnsupportedOperationException("unsupported CharacterIterator");
this.text = (CharArrayIterator) text;
ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
rules.setText(working);
workingOffset = 0;
}
@Override
public void setText(String newText) {
CharArrayIterator ci = new CharArrayIterator();
ci.setText(newText.toCharArray(), 0, newText.length());
setText(ci);
}
private boolean verifyPushBack(int current, int next) {
int shortenedSyllable = next - current - 1;
verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
verify.setText(verifyText);
if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
return false;
verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
verify.setText(verifyText);
return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
}
// TODO: only bubblesort around runs of combining marks, instead of the entire text.
// Bubble-sorts characters with a nonzero canonical combining class into
// non-decreasing class order, leaving class-0 (starter) characters in place.
private void ccReorder(char[] text, int start, int length) {
  final int end = start + length;
  boolean swapped = true;
  while (swapped) {
    swapped = false;
    int previousClass = 0;
    for (int i = start; i < end; i++) {
      final char ch = text[i];
      final int combiningClass = UCharacter.getCombiningClass(ch);
      // a combining mark with a lower class than its predecessor is out of
      // canonical order: bubble it one slot to the left and do another pass
      if (combiningClass > 0 && combiningClass < previousClass) {
        text[i] = text[i - 1];
        text[i - 1] = ch;
        swapped = true;
      } else {
        previousClass = combiningClass;
      }
    }
  }
}
/**
 * Creates an independent copy: the underlying break iterators and character
 * iterators are cloned as well, so the copy iterates without affecting this
 * instance's state.
 * @return The clone.
 */
@Override
public LaoBreakIterator clone() {
  LaoBreakIterator copy = (LaoBreakIterator) super.clone();
  copy.rules = (RuleBasedBreakIterator) rules.clone();
  copy.verify = (RuleBasedBreakIterator) verify.clone();
  copy.text = (text == null) ? null : text.clone();
  copy.working = (working == null) ? null : working.clone();
  copy.verifyText = (verifyText == null) ? null : verifyText.clone();
  return copy;
}
}

View File

@ -59,6 +59,15 @@ final class ScriptIterator {
private int scriptStart;
private int scriptLimit;
private int scriptCode;
// when true, Han/Hiragana/Katakana are all reported as a single Japanese script run
private final boolean combineCJ;
/**
 * Creates a new script-run iterator.
 * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
 */
ScriptIterator(boolean combineCJ) {
this.combineCJ = combineCJ;
}
/**
* Get the start of this script run
@ -162,10 +171,24 @@ final class ScriptIterator {
}
/** fast version of UScript.getScript(). Basic Latin is an array lookup */
private static int getScript(int codepoint) {
if (0 <= codepoint && codepoint < basicLatin.length)
private int getScript(int codepoint) {
if (0 <= codepoint && codepoint < basicLatin.length) {
return basicLatin[codepoint];
else
return UScript.getScript(codepoint);
} else {
int script = UScript.getScript(codepoint);
if (combineCJ) {
if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
return UScript.JAPANESE;
} else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
// when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
// they are treated as punctuation. we currently have no cleaner way to fix this!
return UScript.LATIN;
} else {
return script;
}
} else {
return script;
}
}
}
}

View File

@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribut
@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(ScriptAttribute.class, "script", getName());
// when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
// mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
// but this is just to help prevent confusion.
String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
reflector.reflect(ScriptAttribute.class, "script", name);
}
}

View File

@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 6.3). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.

View File

@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@ -52,7 +52,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append('a');
}
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
char token[] = new char[4096];
Arrays.fill(token, 'a');
String expectedToken = new String(token);
@ -69,7 +69,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer tokenizer = new ICUTokenizer(reader);
Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@ -118,6 +118,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testLao() throws Exception {
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
}
public void testThai() throws Exception {
@ -138,6 +139,13 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
new String[] { "", "", "", "", "", "1234", "tests"});
}
public void testHebrew() throws Exception {
assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
assertAnalyzesTo(a, "חברת בת של מודי'ס",
new String[] { "חברת", "בת", "של", "מודי'ס" });
}
public void testEmpty() throws Exception {
assertAnalyzesTo(a, "", new String[] {});
assertAnalyzesTo(a, ".", new String[] {});

View File

@ -0,0 +1,91 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
 * test ICUTokenizer with dictionary-based CJ segmentation
 */
public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
// default ICUTokenizer config: dictionary-based CJ segmentation enabled
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new ICUTokenizer(reader));
}
};
/**
 * test stolen from smartcn
 */
public void testSimpleChinese() throws Exception {
// NOTE(review): several expected tokens below render as "" in this view — likely
// mojibake of single-character CJK tokens; verify against the repository copy
assertAnalyzesTo(a, "我购买了道具和服装。",
new String[] { "", "购买", "", "道具", "", "服装" }
);
}
// digits adjacent to CJK text must come through as number tokens, not punctuation
public void testChineseNumerics() throws Exception {
assertAnalyzesTo(a, "", new String[] { "" });
assertAnalyzesTo(a, "院內分機9483。",
new String[] { "", "", "分機", "" });
assertAnalyzesTo(a, "院內分機9483。",
new String[] { "", "", "分機", "9483" });
}
/**
 * test stolen from kuromoji
 */
public void testSimpleJapanese() throws Exception {
assertAnalyzesTo(a, "それはまだ実験段階にあります",
new String[] { "それ", "", "まだ", "実験", "段階", "", "あり", "ます" }
);
}
// Japanese tokens should carry the <IDEOGRAPHIC> type
public void testJapaneseTypes() throws Exception {
assertAnalyzesTo(a, "仮名遣い カタカナ",
new String[] { "仮名遣い", "カタカナ" },
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
public void testKorean() throws Exception {
// Korean words
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
}
/** make sure that we still tag korean as HANGUL (for further decomposition/ngram/whatever) */
public void testKoreanTypes() throws Exception {
assertAnalyzesTo(a, "훈민정음",
new String[] { "훈민정음" },
new String[] { "<HANGUL>" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -1,90 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStream;
import org.apache.lucene.util.LuceneTestCase;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
/**
 * Tests LaoBreakIterator and its RBBI rules
 */
public class TestLaoBreakIterator extends LuceneTestCase {
  private BreakIterator wordIterator;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    // Lao.brk contains pre-compiled RBBI rules; close the stream even if loading fails
    InputStream is = getClass().getResourceAsStream("Lao.brk");
    try {
      wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
    } finally {
      is.close();
    }
  }

  /**
   * Iterates sourceText and asserts that the word tokens produced equal {@code tokens}
   * (spans containing no letter or digit are skipped) and that the iterator is exhausted.
   */
  private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
    char text[] = sourceText.toCharArray();
    CharArrayIterator ci = new CharArrayIterator();
    ci.setText(text, 0, text.length);
    iterator.setText(ci);
    for (int i = 0; i < tokens.length; i++) {
      int start, end;
      do {
        start = iterator.current();
        end = iterator.next();
      } while (end != BreakIterator.DONE && !isWord(text, start, end));
      assertTrue(start != BreakIterator.DONE);
      assertTrue(end != BreakIterator.DONE);
      assertEquals(tokens[i], new String(text, start, end - start));
    }
    assertTrue(iterator.next() == BreakIterator.DONE);
  }

  /**
   * Returns true if text[start, end) contains at least one letter or digit.
   * <p>
   * Fixed: the original passed {@code start} (not the loop index {@code i}) as the
   * offset to {@code UTF16.charAt}, so every iteration re-read the first code point
   * of the span and later code points were never examined.
   */
  protected boolean isWord(char text[], int start, int end) {
    int codepoint;
    for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
      codepoint = UTF16.charAt(text, 0, end, i);
      if (UCharacter.isLetterOrDigit(codepoint))
        return true;
    }
    return false;
  }

  public void testBasicUsage() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
    assertBreaksTo(wordIterator, "ຜູ້​ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
    assertBreaksTo(wordIterator, "", new String[] {});
    assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
  }

  public void testNumerics() throws Exception {
    assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
    assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
  }

  public void testTextAndNumerics() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
  }
}

View File

@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenStream result = new CJKBigramFilter(source);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}
@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer2 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
// we put this before the CJKBigramFilter, because the normalization might combine
// some halfwidth katakana forms, which will affect the bigramming.
TokenStream result = new ICUNormalizer2Filter(source);

View File

@ -62,7 +62,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-49-1-2";
private static final String ICU_RELEASE_TAG = "release-52-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";

View File

@ -46,7 +46,7 @@ com.google.inject.guice.version = 3.0
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.0-RC-1
/com.ibm.icu/icu4j = 49.1
/com.ibm.icu/icu4j = 52.1
/com.spatial4j/spatial4j = 0.3
com.sun.jersey.version = 1.8

View File

@ -1 +0,0 @@
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c

View File

@ -0,0 +1 @@
7dbc327670673acd14b487d120f05747d712c1c0

View File

@ -635,7 +635,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
int charUpto = 0;
final StringBuilder sb = new StringBuilder();
while (charUpto < s.length()) {
final int c = s.codePointAt(charUpto);
final int c = s.charAt(charUpto);
if (c == 0xa) {
// Strangely, you cannot put \ u000A into Java
// sources (not in a comment nor a string
@ -655,7 +655,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// don't escape...
sb.append(String.format(Locale.ROOT, "\\u%04x", c));
}
charUpto += Character.charCount(c);
charUpto++;
}
return sb.toString();
}

View File

@ -1 +0,0 @@
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c

View File

@ -0,0 +1 @@
7dbc327670673acd14b487d120f05747d712c1c0