LUCENE-4381: upgrade ICU to icu4j 52.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1547502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-12-03 18:05:23 +00:00
parent 5f5098299a
commit b5dbac5e35
37 changed files with 281 additions and 667 deletions

View File

@ -89,6 +89,8 @@ Build
* LUCENE-5347: Upgrade forbidden-apis checker to version 1.4.
(Uwe Schindler)
* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)
Bug fixes
* LUCENE-5285: Improved highlighting of multi-valued fields with

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros

View File

@ -14,27 +14,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default RBBI rules, based on UAX#29.
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################
!!chain;
#
# Character Class Definitions.
#
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
# Dictionary character set, for triggering language-based break engines. Currently
@ -42,24 +67,34 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
# include the dictionary characters.
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
$dictionary = [$ComplexContext];
# leave CJK scripts out of ALetterPlus
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
$KatakanaEx = $Katakana ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
$Hiragana = [\p{script=Hiragana}];
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
@ -77,23 +112,31 @@ $CR $LF;
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s).
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$Hebrew_LetterEx{200};
$KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtual of being numerically larger.
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
#
# rule 5
# Do not break between most letters.
#
$ALetterEx $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 7a
$Hebrew_LetterEx $Single_QuoteEx {200};
# rule 7b and 7c
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
# rule 8
@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};
# rule 9
$ALetterEx $NumericEx {200};
($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
# rule 10
$NumericEx $ALetterEx {200};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
# rule 13
$KatakanaEx $KatakanaEx {300};
# rule 13a/b
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
# rule 13c
$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};

View File

@ -1,61 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This is an example of rule tailoring for Hebrew.
# In this example the single-quote is added to the Extend category
# The double-quote is added to the MidLetter category.
#
!!chain;
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}\u0027];
$Format = [\p{Word_Break = Format}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}\u0022];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$CR $LF;
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$ALetterEx $ALetterEx {200};
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
$NumericEx $NumericEx {100};
$ALetterEx $NumericEx {200};
$NumericEx $ALetterEx {200};
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$ALetterEx $ExtendNumLetEx {200};
$NumericEx $ExtendNumLetEx {100};
$ExtendNumLetEx $ExtendNumLetEx {200};
$ExtendNumLetEx $ALetterEx {200};
$ExtendNumLetEx $NumericEx {100};

View File

@ -1,192 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Parses Lao text, with syllable as token.
#
# The definition of Lao syllable is based from:
#
# Syllabification of Lao Script for Line Breaking
# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
#
# NOTE:
# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
#
# Syllable structure, where X is the nuclear consonant:
#
# +----+
# | X5 |
# +----+
# | X4 |
# +----+----+----+----+----+----+----+-----+
# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
# +----+----+----+----+----+----+----+-----+
# | X2 |
# +----+
# | X3 |
# +----+
#
# X0 represents a vowel which occurs before the nuclear consonant.
# It can always define the beginning of syllable.
$X0 = [\u0EC0-\u0EC4];
# X1 is a combination consonant which comes before the nuclear consonant,
# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
$X1 = [\u0EAB];
# X represents the nuclear consonant.
$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
# X2 is a combination consonant which comes after the nuclear consonant,
# which is placed under or next to the nuclear consonant.
$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
# X3 represents a vowel which occurs under the nuclear consonant.
$X3 = [\u0EB8\u0EB9];
# X4 represents a vowel which occurs above the nuclear consonant.
$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
$X5 = [\u0EC8-\u0ECB];
# X6 represents a consonant vowel, which occurs after the nuclear consonant.
# It functions when the syllable doesn't have any vowels. And it always exists with X8.
$X6 = [\u0EA7\u0EAD\u0EBD];
# X7 represents a final vowel.
# However X7_1 always represents the end of syllable and it never exists with tone mark.
$X7 = [\u0EB0\u0EB2\u0EB3];
# X8 represents an alternate consonant.
$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
# X9 represents alternate consonants to pronounce foreign terms; they always exist with X10_3.
$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
# X10 represents a sign mark.
# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
$X10 = [\u0EAF\u0EC6\u0ECC];
# Section 1
$X0_1 = [\u0EC0];
$X4_1_2 = [\u0EB4\u0EB5];
$X4_3_4 = [\u0EB6\u0EB7];
$X4_6 = [\u0EBB];
$X4_7 = [\u0EB1];
$X6_2 = [\u0EAD];
$X6_3 = [\u0EBD];
$X7_1 = [\u0EB0];
$X7_2 = [\u0EB2];
$X10_1 = [\u0EAF];
$X10_2 = [\u0EC6];
$X10_3 = [\u0ECC];
$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
# Section 2
$X0_2 = [\u0EC1];
$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
# Section 3
$X0_3 = [\u0EC2];
$X8_3 = [\u0E8D];
$X8_8 = [\u0EA7];
$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
# Section 4
$X0_4 = [\u0EC4];
$X6_1 = [\u0EA7];
$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 5
$X0_5 = [\u0EC3];
$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 6
$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 7
$X4_1_4 = [\u0EB4-\u0EB7];
$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 8
$X4_5 = [\u0ECD];
$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 9
$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
$Rule9 = ($Rule9_1 | $Rule9_2);
# Section 10
$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 11
$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 12
$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
# Section 13
$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 14
$X7_3 = [\u0EB3];
$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
$WordJoin = [:Line_Break=Word_Joiner:];
$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$LaoJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -78,7 +78,6 @@ FF0D>002D
## Space Folding
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
1680>0020
180E>0020
## Spacing Accents folding (done by kd)

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfc.txt
@ -7,7 +7,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 6.1.0
* Unicode 6.3.0
# Canonical_Combining_Class (ccc) values
0300..0314:230

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
@ -11,7 +11,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 6.1.0
* Unicode 6.3.0
00A0>0020
00A8>0020 0308

View File

@ -1,5 +1,5 @@
# Unicode Character Database
# Copyright (c) 1991-2012 Unicode, Inc.
# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 6.1.0
* Unicode 6.3.0
0041>0061
0042>0062
@ -537,6 +537,7 @@
0555>0585
0556>0586
0587>0565 0582
061C>
0675>0627 0674
0676>0648 0674
0677>06C7 0674
@ -627,7 +628,7 @@
10FC>10DC
115F..1160>
17B4..17B5>
180B..180D>
180B..180E>
1D2C>0061
1D2D>00E6
1D2E>0062

View File

@ -21,7 +21,6 @@ import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
@ -60,15 +59,12 @@ abstract class BreakIteratorWrapper {
}
/**
* If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
* treat it like a generic BreakIterator If its any other
* RuleBasedBreakIterator, the rule status can be used for token type. If its
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
* any other BreakIterator, the rule status method is not available, so treat
* it like a generic BreakIterator.
*/
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
if (breakIterator instanceof RuleBasedBreakIterator
&& !(breakIterator instanceof DictionaryBasedBreakIterator))
if (breakIterator instanceof RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
else
return new BIWrapper(breakIterator);

View File

@ -41,12 +41,13 @@ final class CompositeBreakIterator {
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
private BreakIteratorWrapper rbbi;
private final ScriptIterator scriptIterator = new ScriptIterator();
private final ScriptIterator scriptIterator;
private char text[];
CompositeBreakIterator(ICUTokenizerConfig config) {
this.config = config;
this.scriptIterator = new ScriptIterator(config.combineCJ());
}
/**

View File

@ -35,12 +35,9 @@ import com.ibm.icu.util.ULocale;
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings:
* <ul>
* <li>Thai text is broken into words with a
* {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
* <li>Lao, Myanmar, and Khmer text is broken into syllables
* <li>Thai, Lao, and CJK text is broken into words with a dictionary.
* <li>Myanmar and Khmer text is broken into syllables
* based on custom BreakIterator rules.
* <li>Hebrew text has custom tailorings to handle special cases
* involving punctuation.
* </ul>
* @lucene.experimental
*/
@ -62,34 +59,44 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
* the default breakiterators in use. these can be expensive to
* instantiate, cheap to clone.
*/
private static final BreakIterator rootBreakIterator =
// we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
// the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator =
readBreakIterator("Default.brk");
private static final BreakIterator thaiBreakIterator =
BreakIterator.getWordInstance(new ULocale("th_TH"));
private static final BreakIterator hebrewBreakIterator =
readBreakIterator("Hebrew.brk");
private static final BreakIterator khmerBreakIterator =
readBreakIterator("Khmer.brk");
private static final BreakIterator laoBreakIterator =
new LaoBreakIterator(readBreakIterator("Lao.brk"));
private static final BreakIterator myanmarBreakIterator =
readBreakIterator("Myanmar.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
private final boolean cjkAsWords;
/**
* Creates a new config. This object is lightweight, but the first
* time the class is referenced, breakiterators will be initialized.
* @param cjkAsWords true if cjk text should undergo dictionary-based segmentation,
* otherwise text will be segmented according to UAX#29 defaults.
* If this is true, all Han+Hiragana+Katakana words will be tagged as
* IDEOGRAPHIC.
*/
public DefaultICUTokenizerConfig() {}
public DefaultICUTokenizerConfig(boolean cjkAsWords) {
this.cjkAsWords = cjkAsWords;
}
@Override
public boolean combineCJ() {
return cjkAsWords;
}
@Override
public BreakIterator getBreakIterator(int script) {
switch(script) {
case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
default: return (BreakIterator)rootBreakIterator.clone();
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
default: return (BreakIterator)defaultBreakIterator.clone();
}
}

View File

@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer {
* @see DefaultICUTokenizerConfig
*/
public ICUTokenizer(Reader input) {
this(input, new DefaultICUTokenizerConfig());
this(input, new DefaultICUTokenizerConfig(true));
}
/**

View File

@ -36,4 +36,6 @@ public abstract class ICUTokenizerConfig {
/** Return a token type value for a given script and BreakIterator
* rule status. */
public abstract String getType(int script, int ruleStatus);
/** true if Han, Hiragana, and Katakana scripts should all be returned as Japanese */
public abstract boolean combineCJ();
}

View File

@ -70,7 +70,7 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
* <pre class="prettyprint" >
* &lt;fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.ICUTokenizerFactory"
* &lt;tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
* rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
static final String RULEFILES = "rulefiles";
private final Map<Integer,String> tailored;
private ICUTokenizerConfig config;
private final boolean cjkAsWords;
/** Creates a new ICUTokenizerFactory */
public ICUTokenizerFactory(Map<String,String> args) {
@ -94,6 +95,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
}
}
cjkAsWords = getBoolean(args, "cjkAsWords", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -103,7 +105,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
public void inform(ResourceLoader loader) throws IOException {
assert tailored != null : "init must be called first!";
if (tailored.isEmpty()) {
config = new DefaultICUTokenizerConfig();
config = new DefaultICUTokenizerConfig(cjkAsWords);
} else {
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
@ -111,7 +113,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
String resourcePath = entry.getValue();
breakers[code] = parseRules(resourcePath, loader);
}
config = new DefaultICUTokenizerConfig() {
config = new DefaultICUTokenizerConfig(cjkAsWords) {
@Override
public BreakIterator getBreakIterator(int script) {

View File

@ -1,230 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
/**
* Syllable iterator for Lao text.
* <p>
* This breaks Lao text into syllables according to:
* <i>Syllabification of Lao Script for Line Breaking</i>
* Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
* Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
* <ul>
* <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
* <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
* </ul>
* <p>
* Most work is accomplished with RBBI rules, however some additional special logic is needed
* that cannot be coded in a grammar, and this is implemented here.
* <p>
* For example, what appears to be a final consonant might instead be part of the next syllable.
* Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
* <p>
* Take for instance the text ກວ່າດອກ
* The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
* What LaoBreakIterator does, according to the paper:
* <ol>
* <li>backtrack and remove the from the last syllable, placing it on the current syllable.
* <li>verify the modified previous syllable (ກວ່າ ) is still legal.
* <li>verify the modified current syllable (ດອກ) is now legal.
* <li>If 2 or 3 fails, then restore the to the last syllable and skip the current character.
* </ol>
* <p>
* Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
* This is the issue of combining marks being in the wrong order (typos).
* @lucene.experimental
*/
public class LaoBreakIterator extends BreakIterator {
RuleBasedBreakIterator rules;
CharArrayIterator text;
CharArrayIterator working = new CharArrayIterator();
int workingOffset = 0;
CharArrayIterator verifyText = new CharArrayIterator();
RuleBasedBreakIterator verify;
private static final UnicodeSet laoSet;
static {
laoSet = new UnicodeSet("[:Lao:]");
laoSet.compact();
laoSet.freeze();
}
/**
* Creates a new iterator, performing the backtracking verification
* across the provided <code>rules</code>.
*/
public LaoBreakIterator(RuleBasedBreakIterator rules) {
this.rules = (RuleBasedBreakIterator) rules.clone();
this.verify = (RuleBasedBreakIterator) rules.clone();
}
@Override
public int current() {
int current = rules.current();
return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
}
@Override
public int first() {
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
rules.setText(working);
workingOffset = 0;
int first = rules.first();
return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
}
@Override
public int following(int offset) {
throw new UnsupportedOperationException();
}
@Override
public CharacterIterator getText() {
return text;
}
@Override
public int last() {
throw new UnsupportedOperationException();
}
@Override
public int next() {
int current = current();
int next = rules.next();
if (next == BreakIterator.DONE)
return next;
else
next += workingOffset;
char c = working.current();
int following = rules.next(); // lookahead
if (following != BreakIterator.DONE) {
following += workingOffset;
if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
workingOffset = next - 1;
working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
return next - 1;
}
rules.previous(); // undo the lookahead
}
return next;
}
@Override
public int next(int n) {
if (n < 0)
throw new UnsupportedOperationException("Backwards traversal is unsupported");
int result = current();
while (n > 0) {
result = next();
--n;
}
return result;
}
@Override
public int previous() {
throw new UnsupportedOperationException("Backwards traversal is unsupported");
}
@Override
public void setText(CharacterIterator text) {
if (!(text instanceof CharArrayIterator))
throw new UnsupportedOperationException("unsupported CharacterIterator");
this.text = (CharArrayIterator) text;
ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
rules.setText(working);
workingOffset = 0;
}
@Override
public void setText(String newText) {
CharArrayIterator ci = new CharArrayIterator();
ci.setText(newText.toCharArray(), 0, newText.length());
setText(ci);
}
private boolean verifyPushBack(int current, int next) {
int shortenedSyllable = next - current - 1;
verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
verify.setText(verifyText);
if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
return false;
verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
verify.setText(verifyText);
return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
}
// TODO: only bubblesort around runs of combining marks, instead of the entire text.
// Bubble-sorts characters with a nonzero canonical combining class into
// non-decreasing class order, leaving class-0 (starter) characters in place.
private void ccReorder(char[] text, int start, int length) {
  final int end = start + length;
  boolean swapped = true;
  while (swapped) {
    swapped = false;
    int previousClass = 0;
    for (int i = start; i < end; i++) {
      final char ch = text[i];
      final int combiningClass = UCharacter.getCombiningClass(ch);
      // a combining mark with a lower class than its predecessor is out of
      // canonical order: bubble it one slot to the left and do another pass
      if (combiningClass > 0 && combiningClass < previousClass) {
        text[i] = text[i - 1];
        text[i - 1] = ch;
        swapped = true;
      } else {
        previousClass = combiningClass;
      }
    }
  }
}
/**
 * Creates an independent copy: the underlying break iterators and character
 * iterators are cloned as well, so the copy iterates without affecting this
 * instance's state.
 * @return The clone.
 */
@Override
public LaoBreakIterator clone() {
  LaoBreakIterator copy = (LaoBreakIterator) super.clone();
  copy.rules = (RuleBasedBreakIterator) rules.clone();
  copy.verify = (RuleBasedBreakIterator) verify.clone();
  copy.text = (text == null) ? null : text.clone();
  copy.working = (working == null) ? null : working.clone();
  copy.verifyText = (verifyText == null) ? null : verifyText.clone();
  return copy;
}
}

View File

@ -59,6 +59,15 @@ final class ScriptIterator {
private int scriptStart;
private int scriptLimit;
private int scriptCode;
// when true, Han/Hiragana/Katakana are all reported as a single Japanese script run
private final boolean combineCJ;
/**
 * Creates a new script-run iterator.
 * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
 */
ScriptIterator(boolean combineCJ) {
this.combineCJ = combineCJ;
}
/**
* Get the start of this script run
@ -162,10 +171,24 @@ final class ScriptIterator {
}
/** fast version of UScript.getScript(). Basic Latin is an array lookup */
private static int getScript(int codepoint) {
if (0 <= codepoint && codepoint < basicLatin.length)
private int getScript(int codepoint) {
if (0 <= codepoint && codepoint < basicLatin.length) {
return basicLatin[codepoint];
else
return UScript.getScript(codepoint);
} else {
int script = UScript.getScript(codepoint);
if (combineCJ) {
if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
return UScript.JAPANESE;
} else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
// when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
// they are treated as punctuation. we currently have no cleaner way to fix this!
return UScript.LATIN;
} else {
return script;
}
} else {
return script;
}
}
}
}

View File

@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribut
@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(ScriptAttribute.class, "script", getName());
// when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
// mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
// but this is just to help prevent confusion.
String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
reflector.reflect(ScriptAttribute.class, "script", name);
}
}

View File

@ -353,7 +353,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 6.3). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.

View File

@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@ -52,7 +52,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append('a');
}
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
char token[] = new char[4096];
Arrays.fill(token, 'a');
String expectedToken = new String(token);
@ -69,7 +69,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer tokenizer = new ICUTokenizer(reader);
Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@ -118,6 +118,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testLao() throws Exception {
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
}
public void testThai() throws Exception {
@ -138,6 +139,13 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
new String[] { "", "", "", "", "", "1234", "tests"});
}
public void testHebrew() throws Exception {
assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
assertAnalyzesTo(a, "חברת בת של מודי'ס",
new String[] { "חברת", "בת", "של", "מודי'ס" });
}
public void testEmpty() throws Exception {
assertAnalyzesTo(a, "", new String[] {});
assertAnalyzesTo(a, ".", new String[] {});

View File

@ -0,0 +1,91 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
 * test ICUTokenizer with dictionary-based CJ segmentation
 */
public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
// default ICUTokenizer config: dictionary-based CJ segmentation enabled
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new ICUTokenizer(reader));
}
};
/**
 * test stolen from smartcn
 */
public void testSimpleChinese() throws Exception {
// NOTE(review): several expected tokens below render as "" in this view — likely
// mojibake of single-character CJK tokens; verify against the repository copy
assertAnalyzesTo(a, "我购买了道具和服装。",
new String[] { "", "购买", "", "道具", "", "服装" }
);
}
// digits adjacent to CJK text must come through as number tokens, not punctuation
public void testChineseNumerics() throws Exception {
assertAnalyzesTo(a, "", new String[] { "" });
assertAnalyzesTo(a, "院內分機9483。",
new String[] { "", "", "分機", "" });
assertAnalyzesTo(a, "院內分機9483。",
new String[] { "", "", "分機", "9483" });
}
/**
 * test stolen from kuromoji
 */
public void testSimpleJapanese() throws Exception {
assertAnalyzesTo(a, "それはまだ実験段階にあります",
new String[] { "それ", "", "まだ", "実験", "段階", "", "あり", "ます" }
);
}
// Japanese tokens should carry the <IDEOGRAPHIC> type
public void testJapaneseTypes() throws Exception {
assertAnalyzesTo(a, "仮名遣い カタカナ",
new String[] { "仮名遣い", "カタカナ" },
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
public void testKorean() throws Exception {
// Korean words
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
}
/** make sure that we still tag korean as HANGUL (for further decomposition/ngram/whatever) */
public void testKoreanTypes() throws Exception {
assertAnalyzesTo(a, "훈민정음",
new String[] { "훈민정음" },
new String[] { "<HANGUL>" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -1,90 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStream;
import org.apache.lucene.util.LuceneTestCase;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
/**
 * Tests LaoBreakIterator and its RBBI rules
 */
public class TestLaoBreakIterator extends LuceneTestCase {
  private BreakIterator wordIterator;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    // Lao.brk contains pre-compiled RBBI rules; close the stream even if loading fails
    InputStream is = getClass().getResourceAsStream("Lao.brk");
    try {
      wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
    } finally {
      is.close();
    }
  }

  /**
   * Iterates sourceText and asserts that the word tokens produced equal {@code tokens}
   * (spans containing no letter or digit are skipped) and that the iterator is exhausted.
   */
  private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
    char text[] = sourceText.toCharArray();
    CharArrayIterator ci = new CharArrayIterator();
    ci.setText(text, 0, text.length);
    iterator.setText(ci);
    for (int i = 0; i < tokens.length; i++) {
      int start, end;
      do {
        start = iterator.current();
        end = iterator.next();
      } while (end != BreakIterator.DONE && !isWord(text, start, end));
      assertTrue(start != BreakIterator.DONE);
      assertTrue(end != BreakIterator.DONE);
      assertEquals(tokens[i], new String(text, start, end - start));
    }
    assertTrue(iterator.next() == BreakIterator.DONE);
  }

  /**
   * Returns true if text[start, end) contains at least one letter or digit.
   * <p>
   * Fixed: the original passed {@code start} (not the loop index {@code i}) as the
   * offset to {@code UTF16.charAt}, so every iteration re-read the first code point
   * of the span and later code points were never examined.
   */
  protected boolean isWord(char text[], int start, int end) {
    int codepoint;
    for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
      codepoint = UTF16.charAt(text, 0, end, i);
      if (UCharacter.isLetterOrDigit(codepoint))
        return true;
    }
    return false;
  }

  public void testBasicUsage() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
    assertBreaksTo(wordIterator, "ຜູ້​ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
    assertBreaksTo(wordIterator, "", new String[] {});
    assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
  }

  public void testNumerics() throws Exception {
    assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
    assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
  }

  public void testTextAndNumerics() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
  }
}

View File

@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenStream result = new CJKBigramFilter(source);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}
@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer2 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
// we put this before the CJKBigramFilter, because the normalization might combine
// some halfwidth katakana forms, which will affect the bigramming.
TokenStream result = new ICUNormalizer2Filter(source);

View File

@ -62,7 +62,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-49-1-2";
private static final String ICU_RELEASE_TAG = "release-52-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";

View File

@ -46,7 +46,7 @@ com.google.inject.guice.version = 3.0
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
/com.googlecode.mp4parser/isoparser = 1.0-RC-1
/com.ibm.icu/icu4j = 49.1
/com.ibm.icu/icu4j = 52.1
/com.spatial4j/spatial4j = 0.3
com.sun.jersey.version = 1.8

View File

@ -1 +0,0 @@
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c

View File

@ -0,0 +1 @@
7dbc327670673acd14b487d120f05747d712c1c0

View File

@ -635,7 +635,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
int charUpto = 0;
final StringBuilder sb = new StringBuilder();
while (charUpto < s.length()) {
final int c = s.codePointAt(charUpto);
final int c = s.charAt(charUpto);
if (c == 0xa) {
// Strangely, you cannot put \ u000A into Java
// sources (not in a comment nor a string
@ -655,7 +655,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// don't escape...
sb.append(String.format(Locale.ROOT, "\\u%04x", c));
}
charUpto += Character.charCount(c);
charUpto++;
}
return sb.toString();
}

View File

@ -1 +0,0 @@
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c

View File

@ -0,0 +1 @@
7dbc327670673acd14b487d120f05747d712c1c0