mirror of https://github.com/apache/lucene.git
LUCENE-4381: upgrade ICU to icu4j 52.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1547502 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5f5098299a
commit
b5dbac5e35
|
@ -89,6 +89,8 @@ Build
|
|||
* LUCENE-5347: Upgrade forbidden-apis checker to version 1.4.
|
||||
(Uwe Schindler)
|
||||
|
||||
* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-5285: Improved highlighting of multi-valued fields with
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 49.1.0.0
|
||||
// Generated using ICU4J 52.1.0.0
|
||||
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
|
||||
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 49.1.0.0
|
||||
// Generated using ICU4J 52.1.0.0
|
||||
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
|
||||
|
||||
|
||||
|
|
|
@ -14,27 +14,52 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Default RBBI rules, based on UAX#29.
|
||||
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
|
||||
#
|
||||
# Copyright (C) 2002-2013, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
||||
##############################################################################
|
||||
#
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
##############################################################################
|
||||
|
||||
!!chain;
|
||||
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
#
|
||||
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$Single_Quote = [\p{Word_Break = Single_Quote}];
|
||||
$Double_Quote = [\p{Word_Break = Double_Quote}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
|
||||
$Han = [:Han:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
|
@ -42,24 +67,34 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
|||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
|
||||
# include the dictionary characters.
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
|
||||
$dictionary = [$ComplexContext];
|
||||
|
||||
# leave CJK scripts out of ALetterPlus
|
||||
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
|
||||
#
|
||||
# Rules 4 Ignore Format and Extend characters,
|
||||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
$KatakanaEx = $Katakana ($Extend | $Format)*;
|
||||
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
|
||||
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
|
||||
|
||||
$Hiragana = [\p{script=Hiragana}];
|
||||
$Ideographic = [\p{Ideographic}];
|
||||
$HiraganaEx = $Hiragana ($Extend | $Format)*;
|
||||
$IdeographicEx = $Ideographic ($Extend | $Format)*;
|
||||
|
@ -77,23 +112,31 @@ $CR $LF;
|
|||
# of a region of Text. The rule here comes into play when the start of text
|
||||
# begins with a group of Format chars, or with a "word" consisting of a single
|
||||
# char that is not in any of the listed word break categories followed by
|
||||
# format char(s).
|
||||
# format char(s), or is not a CJK dictionary character.
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
$HangulSyllable {200};
|
||||
$Hebrew_LetterEx{200};
|
||||
$KatakanaEx {300}; # note: these status values override those from rule 5
|
||||
$HiraganaEx {300}; # by virtual of being numerically larger.
|
||||
$HiraganaEx {300}; # by virtue of being numerically larger.
|
||||
$IdeographicEx {400}; #
|
||||
|
||||
#
|
||||
# rule 5
|
||||
# Do not break between most letters.
|
||||
#
|
||||
$ALetterEx $ALetterEx {200};
|
||||
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 6 and 7
|
||||
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
||||
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 7a
|
||||
$Hebrew_LetterEx $Single_QuoteEx {200};
|
||||
|
||||
# rule 7b and 7c
|
||||
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
|
||||
|
||||
# rule 8
|
||||
|
||||
|
@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};
|
|||
|
||||
# rule 9
|
||||
|
||||
$ALetterEx $NumericEx {200};
|
||||
($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
|
||||
|
||||
# rule 10
|
||||
|
||||
$NumericEx $ALetterEx {200};
|
||||
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
|
||||
|
||||
# rule 13
|
||||
|
||||
$KatakanaEx $KatakanaEx {300};
|
||||
|
||||
# rule 13a/b
|
||||
|
||||
$ALetterEx $ExtendNumLetEx {200}; # (13a)
|
||||
$NumericEx $ExtendNumLetEx {100}; # (13a)
|
||||
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
|
||||
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
||||
$ALetterEx $ExtendNumLetEx {200}; # (13a)
|
||||
$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
|
||||
$NumericEx $ExtendNumLetEx {100}; # (13a)
|
||||
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
|
||||
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
|
||||
|
||||
$ExtendNumLetEx $ALetterEx {200}; # (13b)
|
||||
$ExtendNumLetEx $NumericEx {100}; # (13b)
|
||||
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
||||
$ExtendNumLetEx $ALetterEx {200}; # (13b)
|
||||
$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
|
||||
$ExtendNumLetEx $NumericEx {100}; # (13b)
|
||||
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
||||
|
||||
# rule 13c
|
||||
|
||||
$Regional_IndicatorEx $Regional_IndicatorEx;
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#
|
||||
# This is an example of rule tailoring for Hebrew.
|
||||
# In this example the single-quote is added to the Extend category
|
||||
# The double-quote is added to the MidLetter category.
|
||||
#
|
||||
!!chain;
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}\u0027];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}\u0022];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
|
||||
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
!!forward;
|
||||
|
||||
$CR $LF;
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
$ALetterEx $ALetterEx {200};
|
||||
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
||||
$NumericEx $NumericEx {100};
|
||||
$ALetterEx $NumericEx {200};
|
||||
$NumericEx $ALetterEx {200};
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
|
||||
$ALetterEx $ExtendNumLetEx {200};
|
||||
$NumericEx $ExtendNumLetEx {100};
|
||||
$ExtendNumLetEx $ExtendNumLetEx {200};
|
||||
$ExtendNumLetEx $ALetterEx {200};
|
||||
$ExtendNumLetEx $NumericEx {100};
|
|
@ -1,192 +0,0 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Parses Lao text, with syllable as token.
|
||||
#
|
||||
# The definition of Lao syllable is based from:
|
||||
#
|
||||
# Syllabification of Lao Script for Line Breaking
|
||||
# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
|
||||
# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
|
||||
# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
|
||||
# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
|
||||
#
|
||||
# NOTE:
|
||||
# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
|
||||
# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
|
||||
#
|
||||
# Syllable structure, where X is the nuclear consonant:
|
||||
#
|
||||
# +----+
|
||||
# | X5 |
|
||||
# +----+
|
||||
# | X4 |
|
||||
# +----+----+----+----+----+----+----+-----+
|
||||
# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
|
||||
# +----+----+----+----+----+----+----+-----+
|
||||
# | X2 |
|
||||
# +----+
|
||||
# | X3 |
|
||||
# +----+
|
||||
#
|
||||
# X0 represents a vowel which occurs before the nuclear consonant.
|
||||
# It can always define the beginning of syllable.
|
||||
$X0 = [\u0EC0-\u0EC4];
|
||||
# X1 is a combination consonant which comes before the nuclear consonant,
|
||||
# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
|
||||
$X1 = [\u0EAB];
|
||||
# X represents the nuclear consonant.
|
||||
$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
|
||||
# X2 is a combination consonant which comes after the nuclear consonant,
|
||||
# which is placed under or next to the nuclear consonant.
|
||||
$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
|
||||
# X3 represents a vowel which occurs under the nuclear consonant.
|
||||
$X3 = [\u0EB8\u0EB9];
|
||||
# X4 represents a vowel which occurs above the nuclear consonant.
|
||||
$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
|
||||
# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
|
||||
$X5 = [\u0EC8-\u0ECB];
|
||||
# X6 represents a consonant vowel, which occurs after the nuclear consonant.
|
||||
# It functions when the syllable doesn’t have any vowels. And it always exists with X8.
|
||||
$X6 = [\u0EA7\u0EAD\u0EBD];
|
||||
# X7 represents a final vowel.
|
||||
# However X7_1 always represents the end of syllable and it never exists with tone mark.
|
||||
$X7 = [\u0EB0\u0EB2\u0EB3];
|
||||
# X8 represents an alternate consonant.
|
||||
$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
|
||||
# X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3.
|
||||
$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
|
||||
# X10 represents a sign mark.
|
||||
# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
|
||||
$X10 = [\u0EAF\u0EC6\u0ECC];
|
||||
|
||||
# Section 1
|
||||
$X0_1 = [\u0EC0];
|
||||
$X4_1_2 = [\u0EB4\u0EB5];
|
||||
$X4_3_4 = [\u0EB6\u0EB7];
|
||||
$X4_6 = [\u0EBB];
|
||||
$X4_7 = [\u0EB1];
|
||||
$X6_2 = [\u0EAD];
|
||||
$X6_3 = [\u0EBD];
|
||||
$X7_1 = [\u0EB0];
|
||||
$X7_2 = [\u0EB2];
|
||||
$X10_1 = [\u0EAF];
|
||||
$X10_2 = [\u0EC6];
|
||||
$X10_3 = [\u0ECC];
|
||||
|
||||
$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
|
||||
$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
|
||||
|
||||
# Section 2
|
||||
$X0_2 = [\u0EC1];
|
||||
|
||||
$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
|
||||
$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
|
||||
|
||||
# Section 3
|
||||
$X0_3 = [\u0EC2];
|
||||
$X8_3 = [\u0E8D];
|
||||
$X8_8 = [\u0EA7];
|
||||
|
||||
$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
|
||||
$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
|
||||
|
||||
$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
|
||||
|
||||
# Section 4
|
||||
$X0_4 = [\u0EC4];
|
||||
$X6_1 = [\u0EA7];
|
||||
|
||||
$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 5
|
||||
$X0_5 = [\u0EC3];
|
||||
|
||||
$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 6
|
||||
$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 7
|
||||
$X4_1_4 = [\u0EB4-\u0EB7];
|
||||
|
||||
$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 8
|
||||
$X4_5 = [\u0ECD];
|
||||
|
||||
$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 9
|
||||
|
||||
$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
|
||||
|
||||
$Rule9 = ($Rule9_1 | $Rule9_2);
|
||||
|
||||
# Section 10
|
||||
$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 11
|
||||
$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 12
|
||||
$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
|
||||
|
||||
# Section 13
|
||||
$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 14
|
||||
$X7_3 = [\u0EB3];
|
||||
|
||||
$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
|
||||
|
||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
||||
|
||||
$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
|
||||
|
||||
#
|
||||
# default numerical definitions
|
||||
#
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
!!forward;
|
||||
|
||||
$LaoJoinedSyllableEx {200};
|
||||
# default numeric rules
|
||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
@ -78,7 +78,6 @@ FF0D>002D
|
|||
## Space Folding
|
||||
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
|
||||
1680>0020
|
||||
180E>0020
|
||||
|
||||
## Spacing Accents folding (done by kd)
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 1999-2012, International Business Machines
|
||||
# Copyright (C) 1999-2013, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: nfc.txt
|
||||
|
@ -7,7 +7,7 @@
|
|||
#
|
||||
# Complete data for Unicode NFC normalization.
|
||||
|
||||
* Unicode 6.1.0
|
||||
* Unicode 6.3.0
|
||||
|
||||
# Canonical_Combining_Class (ccc) values
|
||||
0300..0314:230
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 1999-2012, International Business Machines
|
||||
# Copyright (C) 1999-2013, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: nfkc.txt
|
||||
|
@ -11,7 +11,7 @@
|
|||
# to NFKC one-way mappings.
|
||||
# Use this file as the second gennorm2 input file after nfc.txt.
|
||||
|
||||
* Unicode 6.1.0
|
||||
* Unicode 6.3.0
|
||||
|
||||
00A0>0020
|
||||
00A8>0020 0308
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2012 Unicode, Inc.
|
||||
# Copyright (c) 1991-2013 Unicode, Inc.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
|
@ -12,7 +12,7 @@
|
|||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||
|
||||
* Unicode 6.1.0
|
||||
* Unicode 6.3.0
|
||||
|
||||
0041>0061
|
||||
0042>0062
|
||||
|
@ -537,6 +537,7 @@
|
|||
0555>0585
|
||||
0556>0586
|
||||
0587>0565 0582
|
||||
061C>
|
||||
0675>0627 0674
|
||||
0676>0648 0674
|
||||
0677>06C7 0674
|
||||
|
@ -627,7 +628,7 @@
|
|||
10FC>10DC
|
||||
115F..1160>
|
||||
17B4..17B5>
|
||||
180B..180D>
|
||||
180B..180E>
|
||||
1D2C>0061
|
||||
1D2D>00E6
|
||||
1D2E>0062
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.text.CharacterIterator;
|
|||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.DictionaryBasedBreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
|
@ -60,15 +59,12 @@ abstract class BreakIteratorWrapper {
|
|||
}
|
||||
|
||||
/**
|
||||
* If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
|
||||
* treat it like a generic BreakIterator If its any other
|
||||
* RuleBasedBreakIterator, the rule status can be used for token type. If its
|
||||
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
|
||||
* any other BreakIterator, the rulestatus method is not available, so treat
|
||||
* it like a generic BreakIterator.
|
||||
*/
|
||||
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
|
||||
if (breakIterator instanceof RuleBasedBreakIterator
|
||||
&& !(breakIterator instanceof DictionaryBasedBreakIterator))
|
||||
if (breakIterator instanceof RuleBasedBreakIterator)
|
||||
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
|
||||
else
|
||||
return new BIWrapper(breakIterator);
|
||||
|
|
|
@ -41,12 +41,13 @@ final class CompositeBreakIterator {
|
|||
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
|
||||
|
||||
private BreakIteratorWrapper rbbi;
|
||||
private final ScriptIterator scriptIterator = new ScriptIterator();
|
||||
private final ScriptIterator scriptIterator;
|
||||
|
||||
private char text[];
|
||||
|
||||
CompositeBreakIterator(ICUTokenizerConfig config) {
|
||||
this.config = config;
|
||||
this.scriptIterator = new ScriptIterator(config.combineCJ());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -35,12 +35,9 @@ import com.ibm.icu.util.ULocale;
|
|||
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
||||
* but with the following tailorings:
|
||||
* <ul>
|
||||
* <li>Thai text is broken into words with a
|
||||
* {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
|
||||
* <li>Lao, Myanmar, and Khmer text is broken into syllables
|
||||
* <li>Thai, Lao, and CJK text is broken into words with a dictionary.
|
||||
* <li>Myanmar and Khmer text is broken into syllables
|
||||
* based on custom BreakIterator rules.
|
||||
* <li>Hebrew text has custom tailorings to handle special cases
|
||||
* involving punctuation.
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -62,34 +59,44 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
* the default breakiterators in use. these can be expensive to
|
||||
* instantiate, cheap to clone.
|
||||
*/
|
||||
private static final BreakIterator rootBreakIterator =
|
||||
// we keep the cjk breaking separate; that's because it cannot be customized (because dictionary
|
||||
// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
|
||||
private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
|
||||
// the same as ROOT, except no dictionary segmentation for cjk
|
||||
private static final BreakIterator defaultBreakIterator =
|
||||
readBreakIterator("Default.brk");
|
||||
private static final BreakIterator thaiBreakIterator =
|
||||
BreakIterator.getWordInstance(new ULocale("th_TH"));
|
||||
private static final BreakIterator hebrewBreakIterator =
|
||||
readBreakIterator("Hebrew.brk");
|
||||
private static final BreakIterator khmerBreakIterator =
|
||||
readBreakIterator("Khmer.brk");
|
||||
private static final BreakIterator laoBreakIterator =
|
||||
new LaoBreakIterator(readBreakIterator("Lao.brk"));
|
||||
private static final BreakIterator myanmarBreakIterator =
|
||||
readBreakIterator("Myanmar.brk");
|
||||
|
||||
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
|
||||
private final boolean cjkAsWords;
|
||||
|
||||
/**
|
||||
* Creates a new config. This object is lightweight, but the first
|
||||
* time the class is referenced, breakiterators will be initialized.
|
||||
* @param cjkAsWords true if cjk text should undergo dictionary-based segmentation,
|
||||
* otherwise text will be segmented according to UAX#29 defaults.
|
||||
* If this is true, all Han+Hiragana+Katakana words will be tagged as
|
||||
* IDEOGRAPHIC.
|
||||
*/
|
||||
public DefaultICUTokenizerConfig() {}
|
||||
public DefaultICUTokenizerConfig(boolean cjkAsWords) {
|
||||
this.cjkAsWords = cjkAsWords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean combineCJ() {
|
||||
return cjkAsWords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BreakIterator getBreakIterator(int script) {
|
||||
switch(script) {
|
||||
case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
|
||||
case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
|
||||
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
|
||||
case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
|
||||
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
|
||||
default: return (BreakIterator)rootBreakIterator.clone();
|
||||
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
|
||||
default: return (BreakIterator)defaultBreakIterator.clone();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer {
|
|||
* @see DefaultICUTokenizerConfig
|
||||
*/
|
||||
public ICUTokenizer(Reader input) {
|
||||
this(input, new DefaultICUTokenizerConfig());
|
||||
this(input, new DefaultICUTokenizerConfig(true));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -36,4 +36,6 @@ public abstract class ICUTokenizerConfig {
|
|||
/** Return a token type value for a given script and BreakIterator
|
||||
* rule status. */
|
||||
public abstract String getType(int script, int ruleStatus);
|
||||
/** Returns true if Han, Hiragana, and Katakana scripts should all be returned as Japanese. */
|
||||
public abstract boolean combineCJ();
|
||||
}
|
||||
|
|
|
@ -70,7 +70,7 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
|
|||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.ICUTokenizerFactory"
|
||||
* <tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
|
||||
* rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
|
@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
|
|||
static final String RULEFILES = "rulefiles";
|
||||
private final Map<Integer,String> tailored;
|
||||
private ICUTokenizerConfig config;
|
||||
private final boolean cjkAsWords;
|
||||
|
||||
/** Creates a new ICUTokenizerFactory */
|
||||
public ICUTokenizerFactory(Map<String,String> args) {
|
||||
|
@ -94,6 +95,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
|
|||
tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
|
||||
}
|
||||
}
|
||||
cjkAsWords = getBoolean(args, "cjkAsWords", true);
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -103,7 +105,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
|
|||
public void inform(ResourceLoader loader) throws IOException {
|
||||
assert tailored != null : "init must be called first!";
|
||||
if (tailored.isEmpty()) {
|
||||
config = new DefaultICUTokenizerConfig();
|
||||
config = new DefaultICUTokenizerConfig(cjkAsWords);
|
||||
} else {
|
||||
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
|
||||
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
|
||||
|
@ -111,7 +113,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
|
|||
String resourcePath = entry.getValue();
|
||||
breakers[code] = parseRules(resourcePath, loader);
|
||||
}
|
||||
config = new DefaultICUTokenizerConfig() {
|
||||
config = new DefaultICUTokenizerConfig(cjkAsWords) {
|
||||
|
||||
@Override
|
||||
public BreakIterator getBreakIterator(int script) {
|
||||
|
|
|
@ -1,230 +0,0 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Syllable iterator for Lao text.
|
||||
* <p>
|
||||
* This breaks Lao text into syllables according to:
|
||||
* <i>Syllabification of Lao Script for Line Breaking</i>
|
||||
* Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
|
||||
* Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
|
||||
* <ul>
|
||||
* <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
|
||||
* <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
|
||||
* </ul>
|
||||
* <p>
|
||||
* Most work is accomplished with RBBI rules, however some additional special logic is needed
|
||||
* that cannot be coded in a grammar, and this is implemented here.
|
||||
* <p>
|
||||
* For example, what appears to be a final consonant might instead be part of the next syllable.
|
||||
* Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
|
||||
* <p>
|
||||
* Take for instance the text ກວ່າດອກ
|
||||
* The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
|
||||
* What LaoBreakIterator does, according to the paper:
|
||||
* <ol>
|
||||
* <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
|
||||
* <li>verify the modified previous syllable (ກວ່າ ) is still legal.
|
||||
* <li>verify the modified current syllable (ດອກ) is now legal.
|
||||
* <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
|
||||
* </ol>
|
||||
* <p>
|
||||
* Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
|
||||
* This is the issue of combining marks being in the wrong order (typos).
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LaoBreakIterator extends BreakIterator {
|
||||
RuleBasedBreakIterator rules;
|
||||
CharArrayIterator text;
|
||||
|
||||
CharArrayIterator working = new CharArrayIterator();
|
||||
int workingOffset = 0;
|
||||
|
||||
CharArrayIterator verifyText = new CharArrayIterator();
|
||||
RuleBasedBreakIterator verify;
|
||||
|
||||
private static final UnicodeSet laoSet;
|
||||
static {
|
||||
laoSet = new UnicodeSet("[:Lao:]");
|
||||
laoSet.compact();
|
||||
laoSet.freeze();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new iterator, performing the backtracking verification
|
||||
* across the provided <code>rules</code>.
|
||||
*/
|
||||
public LaoBreakIterator(RuleBasedBreakIterator rules) {
|
||||
this.rules = (RuleBasedBreakIterator) rules.clone();
|
||||
this.verify = (RuleBasedBreakIterator) rules.clone();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int current() {
|
||||
int current = rules.current();
|
||||
return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int first() {
|
||||
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
|
||||
rules.setText(working);
|
||||
workingOffset = 0;
|
||||
int first = rules.first();
|
||||
return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int following(int offset) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharacterIterator getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int last() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() {
|
||||
int current = current();
|
||||
int next = rules.next();
|
||||
if (next == BreakIterator.DONE)
|
||||
return next;
|
||||
else
|
||||
next += workingOffset;
|
||||
|
||||
char c = working.current();
|
||||
int following = rules.next(); // lookahead
|
||||
if (following != BreakIterator.DONE) {
|
||||
following += workingOffset;
|
||||
if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
|
||||
workingOffset = next - 1;
|
||||
working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
|
||||
return next - 1;
|
||||
}
|
||||
rules.previous(); // undo the lookahead
|
||||
}
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next(int n) {
|
||||
if (n < 0)
|
||||
throw new UnsupportedOperationException("Backwards traversal is unsupported");
|
||||
|
||||
int result = current();
|
||||
while (n > 0) {
|
||||
result = next();
|
||||
--n;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previous() {
|
||||
throw new UnsupportedOperationException("Backwards traversal is unsupported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(CharacterIterator text) {
|
||||
if (!(text instanceof CharArrayIterator))
|
||||
throw new UnsupportedOperationException("unsupported CharacterIterator");
|
||||
this.text = (CharArrayIterator) text;
|
||||
ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
|
||||
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
|
||||
rules.setText(working);
|
||||
workingOffset = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(String newText) {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText(newText.toCharArray(), 0, newText.length());
|
||||
setText(ci);
|
||||
}
|
||||
|
||||
private boolean verifyPushBack(int current, int next) {
|
||||
int shortenedSyllable = next - current - 1;
|
||||
|
||||
verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
|
||||
verify.setText(verifyText);
|
||||
if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
|
||||
return false;
|
||||
|
||||
|
||||
verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
|
||||
verify.setText(verifyText);
|
||||
|
||||
return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
|
||||
}
|
||||
|
||||
// TODO: only bubblesort around runs of combining marks, instead of the entire text.
|
||||
private void ccReorder(char[] text, int start, int length) {
|
||||
boolean reordered;
|
||||
do {
|
||||
int prevCC = 0;
|
||||
reordered = false;
|
||||
for (int i = start; i < start + length; i++) {
|
||||
final char c = text[i];
|
||||
final int cc = UCharacter.getCombiningClass(c);
|
||||
if (cc > 0 && cc < prevCC) {
|
||||
// swap
|
||||
text[i] = text[i - 1];
|
||||
text[i - 1] = c;
|
||||
reordered = true;
|
||||
} else {
|
||||
prevCC = cc;
|
||||
}
|
||||
}
|
||||
|
||||
} while (reordered == true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clone method. Creates another LaoBreakIterator with the same behavior
|
||||
* and current state as this one.
|
||||
* @return The clone.
|
||||
*/
|
||||
@Override
|
||||
public LaoBreakIterator clone() {
|
||||
LaoBreakIterator other = (LaoBreakIterator) super.clone();
|
||||
other.rules = (RuleBasedBreakIterator) rules.clone();
|
||||
other.verify = (RuleBasedBreakIterator) verify.clone();
|
||||
if (text != null)
|
||||
other.text = text.clone();
|
||||
if (working != null)
|
||||
other.working = working.clone();
|
||||
if (verifyText != null)
|
||||
other.verifyText = verifyText.clone();
|
||||
return other;
|
||||
}
|
||||
}
|
|
@ -59,6 +59,15 @@ final class ScriptIterator {
|
|||
private int scriptStart;
|
||||
private int scriptLimit;
|
||||
private int scriptCode;
|
||||
|
||||
private final boolean combineCJ;
|
||||
|
||||
/**
|
||||
* @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
|
||||
*/
|
||||
ScriptIterator(boolean combineCJ) {
|
||||
this.combineCJ = combineCJ;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the start of this script run
|
||||
|
@ -162,10 +171,24 @@ final class ScriptIterator {
|
|||
}
|
||||
|
||||
/** fast version of UScript.getScript(). Basic Latin is an array lookup */
|
||||
private static int getScript(int codepoint) {
|
||||
if (0 <= codepoint && codepoint < basicLatin.length)
|
||||
private int getScript(int codepoint) {
|
||||
if (0 <= codepoint && codepoint < basicLatin.length) {
|
||||
return basicLatin[codepoint];
|
||||
else
|
||||
return UScript.getScript(codepoint);
|
||||
} else {
|
||||
int script = UScript.getScript(codepoint);
|
||||
if (combineCJ) {
|
||||
if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
|
||||
return UScript.JAPANESE;
|
||||
} else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
|
||||
// when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
|
||||
// they are treated as punctuation. we currently have no cleaner way to fix this!
|
||||
return UScript.LATIN;
|
||||
} else {
|
||||
return script;
|
||||
}
|
||||
} else {
|
||||
return script;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribut
|
|||
|
||||
@Override
|
||||
public void reflectWith(AttributeReflector reflector) {
|
||||
reflector.reflect(ScriptAttribute.class, "script", getName());
|
||||
// when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
|
||||
// mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
|
||||
// but this is just to help prevent confusion.
|
||||
String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
|
||||
reflector.reflect(ScriptAttribute.class, "script", name);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -353,7 +353,7 @@ and
|
|||
<h1><a name="backcompat">Backwards Compatibility</a></h1>
|
||||
<p>
|
||||
This module exists to provide up-to-date Unicode functionality that supports
|
||||
the most recent version of Unicode (currently 6.1). However, some users who wish
|
||||
the most recent version of Unicode (currently 6.3). However, some users who wish
|
||||
for stronger backwards compatibility can restrict
|
||||
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
|
||||
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
sb.append(whitespace);
|
||||
sb.append("testing 1234");
|
||||
String input = sb.toString();
|
||||
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
|
||||
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
|
||||
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
sb.append('a');
|
||||
}
|
||||
String input = sb.toString();
|
||||
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
|
||||
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
|
||||
char token[] = new char[4096];
|
||||
Arrays.fill(token, 'a');
|
||||
String expectedToken = new String(token);
|
||||
|
@ -69,7 +69,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer tokenizer = new ICUTokenizer(reader);
|
||||
Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
|
||||
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, filter);
|
||||
}
|
||||
|
@ -118,6 +118,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
public void testLao() throws Exception {
|
||||
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
|
||||
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
|
||||
}
|
||||
|
||||
public void testThai() throws Exception {
|
||||
|
@ -138,6 +139,13 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
|||
new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
|
||||
}
|
||||
|
||||
public void testHebrew() throws Exception {
|
||||
assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
|
||||
new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
|
||||
assertAnalyzesTo(a, "חברת בת של מודי'ס",
|
||||
new String[] { "חברת", "בת", "של", "מודי'ס" });
|
||||
}
|
||||
|
||||
public void testEmpty() throws Exception {
|
||||
assertAnalyzesTo(a, "", new String[] {});
|
||||
assertAnalyzesTo(a, ".", new String[] {});
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
||||
/**
|
||||
* test ICUTokenizer with dictionary-based CJ segmentation
|
||||
*/
|
||||
public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
return new TokenStreamComponents(new ICUTokenizer(reader));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* test stolen from smartcn
|
||||
*/
|
||||
public void testSimpleChinese() throws Exception {
|
||||
assertAnalyzesTo(a, "我购买了道具和服装。",
|
||||
new String[] { "我", "购买", "了", "道具", "和", "服装" }
|
||||
);
|
||||
}
|
||||
|
||||
public void testChineseNumerics() throws Exception {
|
||||
assertAnalyzesTo(a, "9483", new String[] { "9483" });
|
||||
assertAnalyzesTo(a, "院內分機9483。",
|
||||
new String[] { "院", "內", "分機", "9483" });
|
||||
assertAnalyzesTo(a, "院內分機9483。",
|
||||
new String[] { "院", "內", "分機", "9483" });
|
||||
}
|
||||
|
||||
/**
|
||||
* test stolen from kuromoji
|
||||
*/
|
||||
public void testSimpleJapanese() throws Exception {
|
||||
assertAnalyzesTo(a, "それはまだ実験段階にあります",
|
||||
new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
|
||||
);
|
||||
}
|
||||
|
||||
public void testJapaneseTypes() throws Exception {
|
||||
assertAnalyzesTo(a, "仮名遣い カタカナ",
|
||||
new String[] { "仮名遣い", "カタカナ" },
|
||||
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
|
||||
}
|
||||
|
||||
public void testKorean() throws Exception {
|
||||
// Korean words
|
||||
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
|
||||
}
|
||||
|
||||
/** make sure that we still tag korean as HANGUL (for further decomposition/ngram/whatever) */
|
||||
public void testKoreanTypes() throws Exception {
|
||||
assertAnalyzesTo(a, "훈민정음",
|
||||
new String[] { "훈민정음" },
|
||||
new String[] { "<HANGUL>" });
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
/** blast some random large strings through the analyzer */
|
||||
public void testRandomHugeStrings() throws Exception {
|
||||
Random random = random();
|
||||
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
|
||||
}
|
||||
}
|
|
@ -1,90 +0,0 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* Tests LaoBreakIterator and its RBBI rules
|
||||
*/
|
||||
public class TestLaoBreakIterator extends LuceneTestCase {
|
||||
private BreakIterator wordIterator;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
InputStream is = getClass().getResourceAsStream("Lao.brk");
|
||||
wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
|
||||
is.close();
|
||||
}
|
||||
|
||||
private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
|
||||
char text[] = sourceText.toCharArray();
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText(text, 0, text.length);
|
||||
iterator.setText(ci);
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
int start, end;
|
||||
do {
|
||||
start = iterator.current();
|
||||
end = iterator.next();
|
||||
} while (end != BreakIterator.DONE && !isWord(text, start, end));
|
||||
assertTrue(start != BreakIterator.DONE);
|
||||
assertTrue(end != BreakIterator.DONE);
|
||||
assertEquals(tokens[i], new String(text, start, end - start));
|
||||
}
|
||||
|
||||
assertTrue(iterator.next() == BreakIterator.DONE);
|
||||
}
|
||||
|
||||
protected boolean isWord(char text[], int start, int end) {
|
||||
int codepoint;
|
||||
for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
|
||||
codepoint = UTF16.charAt(text, 0, end, start);
|
||||
|
||||
if (UCharacter.isLetterOrDigit(codepoint))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public void testBasicUsage() throws Exception {
|
||||
assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
|
||||
assertBreaksTo(wordIterator, "ຜູ້ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
|
||||
assertBreaksTo(wordIterator, "", new String[] {});
|
||||
assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
|
||||
}
|
||||
|
||||
public void testNumerics() throws Exception {
|
||||
assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
|
||||
assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
|
||||
}
|
||||
|
||||
public void testTextAndNumerics() throws Exception {
|
||||
assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
|
||||
}
|
||||
}
|
|
@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
|
|||
private Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer source = new ICUTokenizer(reader);
|
||||
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
|
||||
TokenStream result = new CJKBigramFilter(source);
|
||||
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
|
||||
}
|
||||
|
@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
|
|||
private Analyzer analyzer2 = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer source = new ICUTokenizer(reader);
|
||||
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
|
||||
// we put this before the CJKBigramFilter, because the normalization might combine
|
||||
// some halfwidth katakana forms, which will affect the bigramming.
|
||||
TokenStream result = new ICUNormalizer2Filter(source);
|
||||
|
|
|
@ -62,7 +62,7 @@ import java.util.regex.Pattern;
|
|||
public class GenerateUTR30DataFiles {
|
||||
private static final String ICU_SVN_TAG_URL
|
||||
= "http://source.icu-project.org/repos/icu/icu/tags";
|
||||
private static final String ICU_RELEASE_TAG = "release-49-1-2";
|
||||
private static final String ICU_RELEASE_TAG = "release-52-1";
|
||||
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
|
||||
private static final String NFC_TXT = "nfc.txt";
|
||||
private static final String NFKC_TXT = "nfkc.txt";
|
||||
|
|
|
@ -46,7 +46,7 @@ com.google.inject.guice.version = 3.0
|
|||
/com.googlecode.concurrentlinkedhashmap/concurrentlinkedhashmap-lru = 1.2
|
||||
/com.googlecode.juniversalchardet/juniversalchardet = 1.0.3
|
||||
/com.googlecode.mp4parser/isoparser = 1.0-RC-1
|
||||
/com.ibm.icu/icu4j = 49.1
|
||||
/com.ibm.icu/icu4j = 52.1
|
||||
/com.spatial4j/spatial4j = 0.3
|
||||
|
||||
com.sun.jersey.version = 1.8
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c
|
|
@ -0,0 +1 @@
|
|||
7dbc327670673acd14b487d120f05747d712c1c0
|
|
@ -635,7 +635,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
int charUpto = 0;
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
while (charUpto < s.length()) {
|
||||
final int c = s.codePointAt(charUpto);
|
||||
final int c = s.charAt(charUpto);
|
||||
if (c == 0xa) {
|
||||
// Strangely, you cannot put \ u000A into Java
|
||||
// sources (not in a comment nor a string
|
||||
|
@ -655,7 +655,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
|
|||
// don't escape...
|
||||
sb.append(String.format(Locale.ROOT, "\\u%04x", c));
|
||||
}
|
||||
charUpto += Character.charCount(c);
|
||||
charUpto++;
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c
|
|
@ -0,0 +1 @@
|
|||
7dbc327670673acd14b487d120f05747d712c1c0
|
Loading…
Reference in New Issue