diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index b2dda552ffa..0075761b963 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -140,6 +140,11 @@ New features * LUCENE-2298: Add analyzers/stempel, an algorithmic stemmer with support for the Polish language. (Andrzej Bialecki via Robert Muir) + * LUCENE-2414: Add ICUTokenizer, a tailorable tokenizer that implements Unicode + Text Segmentation. This tokenizer is useful for documents or collections with + multiple languages. The default configuration includes special support for + Thai, Lao, Myanmar, and Khmer. (Robert Muir, Uwe Schindler) + Build * LUCENE-2124: Moved the JDK-based collation support from contrib/collation diff --git a/lucene/contrib/icu/build.xml b/lucene/contrib/icu/build.xml index af0794c8919..9bab9a11311 100644 --- a/lucene/contrib/icu/build.xml +++ b/lucene/contrib/icu/build.xml @@ -43,7 +43,39 @@ Warning: only works on a big-endian platform! - + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi b/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi new file mode 100644 index 00000000000..744bf0d87fd --- /dev/null +++ b/lucene/contrib/icu/src/data/uax29/Hebrew.rbbi @@ -0,0 +1,61 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# This is an example of rule tailoring for Hebrew. +# In this example the single-quote is added to the Extend category +# The double-quote is added to the MidLetter category. +# +!!chain; +$CR = [\p{Word_Break = CR}]; +$LF = [\p{Word_Break = LF}]; +$Newline = [\p{Word_Break = Newline}]; +$Extend = [\p{Word_Break = Extend}\u0027]; +$Format = [\p{Word_Break = Format}]; +$ALetter = [\p{Word_Break = ALetter}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidLetter = [\p{Word_Break = MidLetter}\u0022]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$dictionary = [:LineBreak = Complex_Context:]; +$Control = [\p{Grapheme_Cluster_Break = Control}]; +$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; + +$ALetterEx = $ALetterPlus ($Extend | $Format)*; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidLetterEx = $MidLetter ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +!!forward; + +$CR $LF; +[^$CR $LF $Newline]? 
($Extend | $Format)+; +$NumericEx {100}; +$ALetterEx {200}; +$ALetterEx $ALetterEx {200}; +$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; +$NumericEx $NumericEx {100}; +$ALetterEx $NumericEx {200}; +$NumericEx $ALetterEx {200}; +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; +$ALetterEx $ExtendNumLetEx {200}; +$NumericEx $ExtendNumLetEx {100}; +$ExtendNumLetEx $ExtendNumLetEx {200}; +$ExtendNumLetEx $ALetterEx {200}; +$ExtendNumLetEx $NumericEx {100}; diff --git a/lucene/contrib/icu/src/data/uax29/Khmer.rbbi b/lucene/contrib/icu/src/data/uax29/Khmer.rbbi new file mode 100644 index 00000000000..32eefd638f8 --- /dev/null +++ b/lucene/contrib/icu/src/data/uax29/Khmer.rbbi @@ -0,0 +1,61 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Parses Khmer text, with orthographic syllable as token. +# +# The definition of Khmer orthographic syllable is taken from the Unicode Standard. +# +# B = base character (consonant, independent vowel, etc) +$KhmerBase = [\u1780-\u17B3]; +# R = robat +$KhmerRobat = [\u17CC]; +# C = consonant shifter +$KhmerShifter = [\u17C9\u17CA]; +# S = subscript consonant or independent vowel sign +$KhmerSub = ([\u17D2] $KhmerBase); +# V = dependent vowel sign +$KhmerVowel = [\u17B4-\u17C5]; +# Z = zero-width joiner or non-joiner +$KhmerZWC = [\u200C\u200D]; +# O = any other sign +$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; + +$WordJoin = [:Line_Break=Word_Joiner:]; + +$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?; + +$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*; + +# +# default numerical definitions +# +$Extend = [\p{Word_Break = Extend}]; +$Format = [\p{Word_Break = Format}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +!!forward; +$KhmerJoinedSyllableEx {200}; + +# default numeric rules +$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; diff --git a/lucene/contrib/icu/src/data/uax29/Lao.rbbi b/lucene/contrib/icu/src/data/uax29/Lao.rbbi new file mode 100644 index 00000000000..27dcaca156d --- /dev/null +++ b/lucene/contrib/icu/src/data/uax29/Lao.rbbi @@ -0,0 +1,192 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Parses Lao text, with syllable as token. +# +# The definition of Lao syllable is based from: +# +# Syllabification of Lao Script for Line Breaking +# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, +# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP +# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf +# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf +# +# NOTE: +# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper. +# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work. +# +# Syllable structure, where X is the nuclear consonant: +# +# +----+ +# | X5 | +# +----+ +# | X4 | +# +----+----+----+----+----+----+----+-----+ +# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 | +# +----+----+----+----+----+----+----+-----+ +# | X2 | +# +----+ +# | X3 | +# +----+ +# +# X0 represents a vowel which occurs before the nuclear consonant. +# It can always define the beginning of syllable. +$X0 = [\u0EC0-\u0EC4]; +# X1 is a combination consonant which comes before the nuclear consonant, +# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ} +$X1 = [\u0EAB]; +# X represents the nuclear consonant. +$X = [\u0E81-\u0EAE\u0EDC\u0EDD]; +# X2 is a combination consonant which comes after the nuclear consonant, +# which is placed under or next to the nuclear consonant. +$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5]; +# X3 represents a vowel which occurs under the nuclear consonant. +$X3 = [\u0EB8\u0EB9]; +# X4 represents a vowel which occurs above the nuclear consonant. +$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1]; +# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel. +$X5 = [\u0EC8-\u0ECB]; +# X6 represents a consonant vowel, which occurs after the nuclear consonant. +# It functions when the syllable doesn’t have any vowels. And it always exists with X8. +$X6 = [\u0EA7\u0EAD\u0EBD]; +# X7 represents a final vowel. +# However X7_1 always represents the end of syllable and it never exists with tone mark. +$X7 = [\u0EB0\u0EB2\u0EB3]; +# X8 represents an alternate consonant. +$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7]; +# X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3. +$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5]; +# X10 represents a sign mark. +# It always occurs at the end of a syllable, but mostly people keep it separate from syllable. +$X10 = [\u0EAF\u0EC6\u0ECC]; + +# Section 1 +$X0_1 = [\u0EC0]; +$X4_1_2 = [\u0EB4\u0EB5]; +$X4_3_4 = [\u0EB6\u0EB7]; +$X4_6 = [\u0EBB]; +$X4_7 = [\u0EB1]; +$X6_2 = [\u0EAD]; +$X6_3 = [\u0EBD]; +$X7_1 = [\u0EB0]; +$X7_2 = [\u0EB2]; +$X10_1 = [\u0EAF]; +$X10_2 = [\u0EC6]; +$X10_3 = [\u0ECC]; + +$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule1_2 = $X0_1 ($X1)? $X ($X2)? 
$X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1; +$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7); + +# Section 2 +$X0_2 = [\u0EC1]; + +$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1; +$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3); + +# Section 3 +$X0_3 = [\u0EC2]; +$X8_3 = [\u0E8D]; +$X8_8 = [\u0EA7]; + +$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1; +$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8); + +$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3); + +# Section 4 +$X0_4 = [\u0EC4]; +$X6_1 = [\u0EA7]; + +$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 5 +$X0_5 = [\u0EC3]; + +$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 6 +$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 7 +$X4_1_4 = [\u0EB4-\u0EB7]; + +$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 8 +$X4_5 = [\u0ECD]; + +$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 9 + +$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; +$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1; + +$Rule9 = ($Rule9_1 | $Rule9_2); + +# Section 10 +$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 11 +$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 12 +$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1; + +# Section 13 +$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +# Section 14 +$X7_3 = [\u0EB3]; + +$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; + +$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14); + +$WordJoin = [:Line_Break=Word_Joiner:]; + +$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*; + +# +# default numerical definitions +# +$Extend = [\p{Word_Break = Extend}]; +$Format = [\p{Word_Break = Format}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +!!forward; + +$LaoJoinedSyllableEx {200}; +# default numeric rules +$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? 
$NumericEx $ExtendNumLetEx?)* {100}; diff --git a/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi b/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi new file mode 100644 index 00000000000..47f541c1835 --- /dev/null +++ b/lucene/contrib/icu/src/data/uax29/Myanmar.rbbi @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Parses Myanmar text, with syllable as token. +# + +$Cons = [[:Other_Letter:]&[:Myanmar:]]; +$Virama = [\u1039]; +$Asat = [\u103A]; + +$WordJoin = [:Line_Break=Word_Joiner:]; + +# +# default numerical definitions +# +$Extend = [\p{Word_Break = Extend}]; +$Format = [\p{Word_Break = Format}]; +$MidNumLet = [\p{Word_Break = MidNumLet}]; +$MidNum = [\p{Word_Break = MidNum}]; +$Numeric = [\p{Word_Break = Numeric}]; +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; +$MidNumEx = $MidNum ($Extend | $Format)*; +$NumericEx = $Numeric ($Extend | $Format)*; +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; + +$ConsEx = $Cons ($Extend | $Format)*; +$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*; +$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*; +$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; + +!!forward; +$MyanmarJoinedSyllableEx {200}; + +# default numeric rules +$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java new file mode 100644 index 00000000000..61d4b011f5c --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java @@ -0,0 +1,171 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.text.CharacterIterator; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.DictionaryBasedBreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.text.UTF16; + +/** + * Contain all the issues surrounding BreakIterators in ICU in one place. + * Basically this boils down to the fact that they aren't very friendly to any + * sort of OO design. + *

+ * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to + * BreakIterator from RuleBasedBreakIterator + *

+ * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but + * doesn't actually behave as a subclass: it always returns 0 for + * getRuleStatus(): + * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type + * tags + * @lucene.experimental + */ +abstract class BreakIteratorWrapper { + protected final CharArrayIterator textIterator = new CharArrayIterator(); + protected char text[]; + protected int start; + protected int length; + + abstract int next(); + abstract int current(); + abstract int getRuleStatus(); + abstract void setText(CharacterIterator text); + + void setText(char text[], int start, int length) { + this.text = text; + this.start = start; + this.length = length; + textIterator.setText(text, start, length); + setText(textIterator); + } + + /** + * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so + * treat it like a generic BreakIterator If its any other + * RuleBasedBreakIterator, the rule status can be used for token type. If its + * any other BreakIterator, the rulestatus method is not available, so treat + * it like a generic BreakIterator. + */ + static BreakIteratorWrapper wrap(BreakIterator breakIterator) { + if (breakIterator instanceof RuleBasedBreakIterator + && !(breakIterator instanceof DictionaryBasedBreakIterator)) + return new RBBIWrapper((RuleBasedBreakIterator) breakIterator); + else + return new BIWrapper(breakIterator); + } + + /** + * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as its not + * a DictionaryBasedBreakIterator) behaves correctly. + */ + static final class RBBIWrapper extends BreakIteratorWrapper { + private final RuleBasedBreakIterator rbbi; + + RBBIWrapper(RuleBasedBreakIterator rbbi) { + this.rbbi = rbbi; + } + + @Override + int current() { + return rbbi.current(); + } + + @Override + int getRuleStatus() { + return rbbi.getRuleStatus(); + } + + @Override + int next() { + return rbbi.next(); + } + + @Override + void setText(CharacterIterator text) { + rbbi.setText(text); + } + } + + /** + * Generic BreakIterator wrapper: Either the rulestatus method is not + * available or always returns 0. Calculate a rulestatus here so it behaves + * like RuleBasedBreakIterator. + * + * Note: This is slower than RuleBasedBreakIterator. + */ + static final class BIWrapper extends BreakIteratorWrapper { + private final BreakIterator bi; + private int status; + + BIWrapper(BreakIterator bi) { + this.bi = bi; + } + + @Override + int current() { + return bi.current(); + } + + @Override + int getRuleStatus() { + return status; + } + + @Override + int next() { + int current = bi.current(); + int next = bi.next(); + status = calcStatus(current, next); + return next; + } + + private int calcStatus(int current, int next) { + if (current == BreakIterator.DONE || next == BreakIterator.DONE) + return RuleBasedBreakIterator.WORD_NONE; + + int begin = start + current; + int end = start + next; + + int codepoint; + for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) { + codepoint = UTF16.charAt(text, 0, end, begin); + + if (UCharacter.isDigit(codepoint)) + return RuleBasedBreakIterator.WORD_NUMBER; + else if (UCharacter.isLetter(codepoint)) { + // TODO: try to separately specify ideographic, kana? 
+ // [currently all bundled as letter for this case] + return RuleBasedBreakIterator.WORD_LETTER; + } + } + + return RuleBasedBreakIterator.WORD_NONE; + } + + @Override + void setText(CharacterIterator text) { + bi.setText(text); + status = RuleBasedBreakIterator.WORD_NONE; + } + } +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java new file mode 100644 index 00000000000..be25dd0eb5c --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CharArrayIterator.java @@ -0,0 +1,118 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.CharacterIterator; + +/** + * Wraps a char[] as CharacterIterator for processing with a BreakIterator + * @lucene.experimental + */ +final class CharArrayIterator implements CharacterIterator { + private char array[]; + private int start; + private int index; + private int length; + private int limit; + + public char [] getText() { + return array; + } + + public int getStart() { + return start; + } + + public int getLength() { + return length; + } + + /** + * Set a new region of text to be examined by this iterator + * + * @param array text buffer to examine + * @param start offset into buffer + * @param length maximum length to examine + */ + void setText(final char array[], int start, int length) { + this.array = array; + this.start = start; + this.index = start; + this.length = length; + this.limit = start + length; + } + + public char current() { + return (index == limit) ? DONE : array[index]; + } + + public char first() { + index = start; + return current(); + } + + public int getBeginIndex() { + return 0; + } + + public int getEndIndex() { + return length; + } + + public int getIndex() { + return index - start; + } + + public char last() { + index = (limit == start) ? 
limit : limit - 1; + return current(); + } + + public char next() { + if (++index >= limit) { + index = limit; + return DONE; + } else { + return current(); + } + } + + public char previous() { + if (--index < start) { + index = start; + return DONE; + } else { + return current(); + } + } + + public char setIndex(int position) { + if (position < getBeginIndex() || position > getEndIndex()) + throw new IllegalArgumentException("Illegal Position: " + position); + index = start + position; + return current(); + } + + @Override + public Object clone() { + CharArrayIterator clone = new CharArrayIterator(); + clone.setText(array, start, length); + clone.index = index; + return clone; + } +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java new file mode 100644 index 00000000000..ba394b44db9 --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java @@ -0,0 +1,126 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.BreakIterator; + +/** + * An internal BreakIterator for multilingual text, following recommendations + * from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/) + *

+ * See http://unicode.org/reports/tr29/#Tailoring for the motivation of this + * design. + *

+ * Text is first divided into script boundaries. The processing is then + * delegated to the appropriate break iterator for that specific script. + *

+ * This break iterator also allows you to retrieve the ISO 15924 script code + * associated with a piece of text. + *

+ * See also UAX #29, UTR #24 + * @lucene.experimental + */ +final class CompositeBreakIterator { + private final ICUTokenizerConfig config; + private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT]; + + private BreakIteratorWrapper rbbi; + private final ScriptIterator scriptIterator = new ScriptIterator(); + + private char text[]; + + CompositeBreakIterator(ICUTokenizerConfig config) { + this.config = config; + } + + /** + * Retrieve the next break position. If the RBBI range is exhausted within the + * script boundary, examine the next script boundary. + * + * @return the next break position or BreakIterator.DONE + */ + int next() { + int next = rbbi.next(); + while (next == BreakIterator.DONE && scriptIterator.next()) { + rbbi = getBreakIterator(scriptIterator.getScriptCode()); + rbbi.setText(text, scriptIterator.getScriptStart(), + scriptIterator.getScriptLimit() - scriptIterator.getScriptStart()); + next = rbbi.next(); + } + return (next == BreakIterator.DONE) ? BreakIterator.DONE : next + + scriptIterator.getScriptStart(); + } + + /** + * Retrieve the current break position. + * + * @return the current break position or BreakIterator.DONE + */ + int current() { + final int current = rbbi.current(); + return (current == BreakIterator.DONE) ? BreakIterator.DONE : current + + scriptIterator.getScriptStart(); + } + + /** + * Retrieve the rule status code (token type) from the underlying break + * iterator + * + * @return rule status code (see RuleBasedBreakIterator constants) + */ + int getRuleStatus() { + return rbbi.getRuleStatus(); + } + + /** + * Retrieve the UScript script code for the current token. This code can be + * decoded with UScript into a name or ISO 15924 code. + * + * @return UScript script code for the current token. + */ + int getScriptCode() { + return scriptIterator.getScriptCode(); + } + + /** + * Set a new region of text to be examined by this iterator + * + * @param text buffer of text + * @param start offset into buffer + * @param length maximum length to examine + */ + void setText(final char text[], int start, int length) { + this.text = text; + scriptIterator.setText(text, start, length); + if (scriptIterator.next()) { + rbbi = getBreakIterator(scriptIterator.getScriptCode()); + rbbi.setText(text, scriptIterator.getScriptStart(), + scriptIterator.getScriptLimit() - scriptIterator.getScriptStart()); + } else { + rbbi = getBreakIterator(UScript.COMMON); + rbbi.setText(text, 0, 0); + } + } + + private BreakIteratorWrapper getBreakIterator(int scriptCode) { + if (wordBreakers[scriptCode] == null) + wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode)); + return wordBreakers[scriptCode]; + } +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java new file mode 100644 index 00000000000..4da1e4dc8b4 --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java @@ -0,0 +1,112 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.InputStream; + +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.util.ULocale; + +/** + * Default {@link ICUTokenizerConfig} that is generally applicable + * to many languages. + *

+ * Generally tokenizes Unicode text according to UAX#29 + * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), + * but with the following tailorings: + *

+ * @lucene.experimental + */ +public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { + /** Token type for words containing ideographic characters */ + public static final String WORD_IDEO = ""; + /** Token type for words containing Japanese kana */ + public static final String WORD_KANA = ""; + /** Token type for words that contain letters */ + public static final String WORD_LETTER = ""; + /** Token type for words that appear to be numbers */ + public static final String WORD_NUMBER = ""; + + /* + * the default breakiterators in use. these can be expensive to + * instantiate, cheap to clone. + */ + private static final BreakIterator rootBreakIterator = + BreakIterator.getWordInstance(ULocale.ROOT); + private static final BreakIterator thaiBreakIterator = + BreakIterator.getWordInstance(new ULocale("th_TH")); + private static final BreakIterator hebrewBreakIterator = + readBreakIterator("Hebrew.brk"); + private static final BreakIterator khmerBreakIterator = + readBreakIterator("Khmer.brk"); + private static final BreakIterator laoBreakIterator = + new LaoBreakIterator(readBreakIterator("Lao.brk")); + private static final BreakIterator myanmarBreakIterator = + readBreakIterator("Myanmar.brk"); + + @Override + public BreakIterator getBreakIterator(int script) { + switch(script) { + case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone(); + case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone(); + case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone(); + case UScript.LAO: return (BreakIterator)laoBreakIterator.clone(); + case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone(); + default: return (BreakIterator)rootBreakIterator.clone(); + } + } + + @Override + public String getType(int script, int ruleStatus) { + switch (ruleStatus) { + case RuleBasedBreakIterator.WORD_IDEO: + return WORD_IDEO; + case RuleBasedBreakIterator.WORD_KANA: + return WORD_KANA; + case RuleBasedBreakIterator.WORD_LETTER: + return WORD_LETTER; + case RuleBasedBreakIterator.WORD_NUMBER: + return WORD_NUMBER; + default: /* some other custom code */ + return ""; + } + } + + private static RuleBasedBreakIterator readBreakIterator(String filename) { + InputStream is = + DefaultICUTokenizerConfig.class.getResourceAsStream(filename); + try { + RuleBasedBreakIterator bi = + RuleBasedBreakIterator.getInstanceFromCompiledRules(is); + is.close(); + return bi; + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java new file mode 100644 index 00000000000..b2022bdad6a --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java @@ -0,0 +1,196 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; + +/** + * Breaks text into words according to UAX #29: Unicode Text Segmentation + * (http://www.unicode.org/reports/tr29/) + *

+ * Words are broken across script boundaries, then segmented according to + * the BreakIterator and typing provided by the {@link ICUTokenizerConfig} + *

+ * @see ICUTokenizerConfig + * @lucene.experimental + */ +public final class ICUTokenizer extends Tokenizer { + private static final int IOBUFFER = 4096; + private final char buffer[] = new char[IOBUFFER]; + /** true length of text in the buffer */ + private int length = 0; + /** length in buffer that can be evaluated safely, up to a safe end point */ + private int usableLength = 0; + /** accumulated offset of previous buffers for this reader, for offsetAtt */ + private int offset = 0; + + private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */ + private final ICUTokenizerConfig config; + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class); + + /** + * Construct a new ICUTokenizer that breaks text into words from the given + * Reader. + *

+ * The default script-specific handling is used. + * + * @param input Reader containing text to tokenize. + * @see DefaultICUTokenizerConfig + */ + public ICUTokenizer(Reader input) { + this(input, new DefaultICUTokenizerConfig()); + } + + /** + * Construct a new ICUTokenizer that breaks text into words from the given + * Reader, using a tailored BreakIterator configuration. + * + * @param input Reader containing text to tokenize. + * @param config Tailored BreakIterator configuration + */ + public ICUTokenizer(Reader input, ICUTokenizerConfig config) { + super(input); + this.config = config; + breaker = new CompositeBreakIterator(config); + } + + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (length == 0) + refill(); + while (!incrementTokenBuffer()) { + refill(); + if (length <= 0) // no more bytes to read; + return false; + } + return true; + } + + @Override + public void reset() throws IOException { + super.reset(); + breaker.setText(buffer, 0, 0); + length = usableLength = offset = 0; + } + + @Override + public void reset(Reader input) throws IOException { + super.reset(input); + reset(); + } + + @Override + public void end() throws IOException { + final int finalOffset = (length < 0) ? offset : offset + length; + offsetAtt.setOffset(finalOffset, finalOffset); + } + + /* + * This tokenizes text based upon the longest matching rule, and because of + * this, isn't friendly to a Reader. + * + * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of + * text, the last unambiguous break point is found (in this implementation: + * white space character) Any remaining characters represent possible partial + * words, so are appended to the front of the next chunk. + * + * There is the possibility that there are no unambiguous break points within + * an entire 4kB chunk of text (binary data). So there is a maximum word limit + * of 4kB since it will not try to grow the buffer in this case. + */ + + /** + * Returns the last unambiguous break position in the text. + * + * @return position of character, or -1 if one does not exist + */ + private int findSafeEnd() { + for (int i = length - 1; i >= 0; i--) + if (UCharacter.isWhitespace(buffer[i])) + return i + 1; + return -1; + } + + /** + * Refill the buffer, accumulating the offset and setting usableLength to the + * last unambiguous break position + * + * @throws IOException + */ + private void refill() throws IOException { + offset += usableLength; + int leftover = length - usableLength; + System.arraycopy(buffer, usableLength, buffer, 0, leftover); + int requested = buffer.length - leftover; + int returned = input.read(buffer, leftover, requested); + length = returned < 0 ? leftover : returned + leftover; + if (returned < requested) /* reader has been emptied, process the rest */ + usableLength = length; + else { /* still more data to be read, find a safe-stopping place */ + usableLength = findSafeEnd(); + if (usableLength < 0) + usableLength = length; /* + * more than IOBUFFER of text without space, + * gonna possibly truncate tokens + */ + } + + breaker.setText(buffer, 0, Math.max(0, usableLength)); + } + + /* + * return true if there is a token from the buffer, or null if it is + * exhausted. 
+ */ + private boolean incrementTokenBuffer() { + int start = breaker.current(); + if (start == BreakIterator.DONE) + return false; // BreakIterator exhausted + + // find the next set of boundaries, skipping over non-tokens (rule status 0) + int end = breaker.next(); + while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) { + start = end; + end = breaker.next(); + } + + if (start == BreakIterator.DONE) + return false; // BreakIterator exhausted + + termAtt.copyBuffer(buffer, start, end - start); + offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end)); + typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus())); + scriptAtt.setCode(breaker.getScriptCode()); + + return true; + } +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java new file mode 100644 index 00000000000..cadc2d64f51 --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java @@ -0,0 +1,33 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import com.ibm.icu.text.BreakIterator; + +/** + * Class that allows for tailored Unicode Text Segmentation on + * a per-writing system basis. + * @lucene.experimental + */ +public abstract class ICUTokenizerConfig { + /** Return a breakiterator capable of processing a given script. */ + public abstract BreakIterator getBreakIterator(int script); + /** Return a token type value for a given script and BreakIterator + * rule status. */ + public abstract String getType(int script, int ruleStatus); +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java new file mode 100644 index 00000000000..ffd4c337fd3 --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java @@ -0,0 +1,226 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.CharacterIterator; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.text.UnicodeSet; + +/** + * Syllable iterator for Lao text. + *

+ * This breaks Lao text into syllables according to: + * Syllabification of Lao Script for Line Breaking + * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, + * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP. + *

    + *
  • http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf + *
  • http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf + *
+ *

+ * Most work is accomplished with RBBI rules; however, some additional special logic is needed + * that cannot be coded in a grammar, and this is implemented here. + *

+ * For example, what appears to be a final consonant might instead be part of the next syllable. + * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules. + *

+ * Take for instance the text ກວ່າດອກ + * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal. + * What LaoBreakIterator does, according to the paper: + *

    + *
  1. backtrack and remove the ດ from the last syllable, placing it on the current syllable. + *
  2. verify the modified previous syllable (ກວ່າ ) is still legal. + *
  3. verify the modified current syllable (ດອກ) is now legal. + *
  4. If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character. + *
+ *

+ * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper. + * This is the issue of combining marks being in the wrong order (typos). + * @lucene.experimental + */ +public class LaoBreakIterator extends BreakIterator { + RuleBasedBreakIterator rules; + CharArrayIterator text; + + CharArrayIterator working = new CharArrayIterator(); + int workingOffset = 0; + + CharArrayIterator verifyText = new CharArrayIterator(); + RuleBasedBreakIterator verify; + + private static final UnicodeSet laoSet; + static { + laoSet = new UnicodeSet("[:Lao:]"); + laoSet.compact(); + laoSet.freeze(); + } + + public LaoBreakIterator(RuleBasedBreakIterator rules) { + this.rules = (RuleBasedBreakIterator) rules.clone(); + this.verify = (RuleBasedBreakIterator) rules.clone(); + } + + @Override + public int current() { + int current = rules.current(); + return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current; + } + + @Override + public int first() { + working.setText(this.text.getText(), this.text.getStart(), this.text.getLength()); + rules.setText(working); + workingOffset = 0; + int first = rules.first(); + return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first; + } + + @Override + public int following(int offset) { + throw new UnsupportedOperationException(); + } + + @Override + public CharacterIterator getText() { + return text; + } + + @Override + public int last() { + throw new UnsupportedOperationException(); + } + + @Override + public int next() { + int current = current(); + int next = rules.next(); + if (next == BreakIterator.DONE) + return next; + else + next += workingOffset; + + char c = working.current(); + int following = rules.next(); // lookahead + if (following != BreakIterator.DONE) { + following += workingOffset; + if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) { + workingOffset = next - 1; + working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset); + return next - 1; + } + rules.previous(); // undo the lookahead + } + + return next; + } + + @Override + public int next(int n) { + if (n < 0) + throw new UnsupportedOperationException("Backwards traversal is unsupported"); + + int result = current(); + while (n > 0) { + result = next(); + --n; + } + return result; + } + + @Override + public int previous() { + throw new UnsupportedOperationException("Backwards traversal is unsupported"); + } + + @Override + public void setText(CharacterIterator text) { + if (!(text instanceof CharArrayIterator)) + throw new UnsupportedOperationException("unsupported CharacterIterator"); + this.text = (CharArrayIterator) text; + ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength()); + working.setText(this.text.getText(), this.text.getStart(), this.text.getLength()); + rules.setText(working); + workingOffset = 0; + } + + @Override + public void setText(String newText) { + CharArrayIterator ci = new CharArrayIterator(); + ci.setText(newText.toCharArray(), 0, newText.length()); + setText(ci); + } + + private boolean verifyPushBack(int current, int next) { + int shortenedSyllable = next - current - 1; + + verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable); + verify.setText(verifyText); + if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0) + return false; + + + verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1); + verify.setText(verifyText); + + 
return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0); + } + + // TODO: only bubblesort around runs of combining marks, instead of the entire text. + private void ccReorder(char[] text, int start, int length) { + boolean reordered; + do { + int prevCC = 0; + reordered = false; + for (int i = start; i < start + length; i++) { + final char c = text[i]; + final int cc = UCharacter.getCombiningClass(c); + if (cc > 0 && cc < prevCC) { + // swap + text[i] = text[i - 1]; + text[i - 1] = c; + reordered = true; + } else { + prevCC = cc; + } + } + + } while (reordered == true); + } + + /** + * Clone method. Creates another LaoBreakIterator with the same behavior + * and current state as this one. + * @return The clone. + */ + @Override + public Object clone() { + LaoBreakIterator other = (LaoBreakIterator) super.clone(); + other.rules = (RuleBasedBreakIterator) rules.clone(); + other.verify = (RuleBasedBreakIterator) verify.clone(); + if (text != null) + other.text = (CharArrayIterator) text.clone(); + if (working != null) + other.working = (CharArrayIterator) working.clone(); + if (verifyText != null) + other.verifyText = (CharArrayIterator) verifyText.clone(); + return other; + } +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java new file mode 100644 index 00000000000..4c327bc04fe --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java @@ -0,0 +1,170 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.UTF16; + +/** + * An iterator that locates ISO 15924 script boundaries in text. + *

+ * This is not the same as simply looking at the Unicode block, or even the + * Script property. Some characters are 'common' across multiple scripts, and + * some 'inherit' the script value of text surrounding them. + *

+ * This is similar to ICU (internal-only) UScriptRun, with the following + * differences: + *

    + *
  • Doesn't attempt to match paired punctuation. For tokenization purposes, this + * is not necessary. It's also quite expensive. +
  • Non-spacing marks inherit the script of their base character, following + * recommendations from UTR #24. + *
+ * @lucene.experimental + */ +final class ScriptIterator { + private char text[]; + private int start; + private int limit; + private int index; + + private int scriptStart; + private int scriptLimit; + private int scriptCode; + + /** + * Get the start of this script run + * + * @return start position of script run + */ + int getScriptStart() { + return scriptStart; + } + + /** + * Get the index of the first character after the end of this script run + * + * @return position of the first character after this script run + */ + int getScriptLimit() { + return scriptLimit; + } + + /** + * Get the UScript script code for this script run + * + * @return code for the script of the current run + */ + int getScriptCode() { + return scriptCode; + } + + /** + * Iterates to the next script run, returning true if one exists. + * + * @return true if there is another script run, false otherwise. + */ + boolean next() { + if (scriptLimit >= limit) + return false; + + scriptCode = UScript.COMMON; + scriptStart = scriptLimit; + + while (index < limit) { + final int ch = UTF16.charAt(text, start, limit, index - start); + final int sc = getScript(ch); + + /* + * From UTR #24: Implementations that determine the boundaries between + * characters of given scripts should never break between a non-spacing + * mark and its base character. Thus for boundary determinations and + * similar sorts of processing, a non-spacing mark — whatever its script + * value — should inherit the script value of its base character. + */ + if (isSameScript(scriptCode, sc) + || UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) { + index += UTF16.getCharCount(ch); + + /* + * Inherited or Common becomes the script code of the surrounding text. + */ + if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { + scriptCode = sc; + } + + } else { + break; + } + } + + scriptLimit = index; + return true; + } + + /** Determine if two scripts are compatible. */ + private static boolean isSameScript(int scriptOne, int scriptTwo) { + return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED + || scriptOne == scriptTwo; + } + + /** + * Set a new region of text to be examined by this iterator + * + * @param text text buffer to examine + * @param start offset into buffer + * @param length maximum length to examine + */ + void setText(char text[], int start, int length) { + this.text = text; + this.start = start; + this.index = start; + this.limit = start + length; + this.scriptStart = start; + this.scriptLimit = start; + this.scriptCode = UScript.INVALID_CODE; + } + + /** linear fast-path for basic latin case */ + private static final int basicLatin[] = new int[128]; + + static { + for (int i = 0; i < basicLatin.length; i++) + basicLatin[i] = UScript.getScript(i); + } + + /** fast version of UScript.getScript(). Basic Latin is an array lookup */ + private static int getScript(int codepoint) { + if (0 <= codepoint && codepoint < basicLatin.length) + return basicLatin[codepoint]; + else + return UScript.getScript(codepoint); + } +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html new file mode 100644 index 00000000000..56eca07febb --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/package.html @@ -0,0 +1,22 @@ + + + + +Tokenizer that breaks text into words with the Unicode Text Segmentation algorithm. 
+ + diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java new file mode 100644 index 00000000000..b60809c962a --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttribute.java @@ -0,0 +1,51 @@ +package org.apache.lucene.analysis.icu.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.Attribute; + +import com.ibm.icu.lang.UScript; // javadoc @link + +/** + * This attribute stores the UTR #24 script value for a token of text. + * @lucene.experimental + */ +public interface ScriptAttribute extends Attribute { + /** + * Get the numeric code for this script value. + * This is the constant value from {@link UScript}. + * @return numeric code + */ + public int getCode(); + /** + * Set the numeric code for this script value. + * This is the constant value from {@link UScript}. + * @param code numeric code + */ + public void setCode(int code); + /** + * Get the full name. + * @return UTR #24 full name. + */ + public String getName(); + /** + * Get the abbreviated name. + * @return UTR #24 abbreviated name. + */ + public String getShortName(); +} diff --git a/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java new file mode 100644 index 00000000000..7e33ee7875f --- /dev/null +++ b/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/ScriptAttributeImpl.java @@ -0,0 +1,83 @@ +package org.apache.lucene.analysis.icu.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Serializable; + +import org.apache.lucene.util.AttributeImpl; + +import com.ibm.icu.lang.UScript; + +/** + * Implementation of {@link ScriptAttribute} that stores the script + * as an integer. + * @lucene.experimental + */ +public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribute, Cloneable, Serializable { + private int code = UScript.COMMON; + + public int getCode() { + return code; + } + + public void setCode(int code) { + this.code = code; + } + + public String getName() { + return UScript.getName(code); + } + + public String getShortName() { + return UScript.getShortName(code); + } + + @Override + public void clear() { + code = UScript.COMMON; + } + + @Override + public void copyTo(AttributeImpl target) { + ScriptAttribute t = (ScriptAttribute) target; + t.setCode(code); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + + if (other instanceof ScriptAttributeImpl) { + return ((ScriptAttributeImpl) other).code == code; + } + + return false; + } + + @Override + public int hashCode() { + return code; + } + + @Override + public String toString() { + return "script=" + getName(); + } +} diff --git a/lucene/contrib/icu/src/java/overview.html b/lucene/contrib/icu/src/java/overview.html index 91674e1ce2d..47a833ca88a 100644 --- a/lucene/contrib/icu/src/java/overview.html +++ b/lucene/contrib/icu/src/java/overview.html @@ -30,6 +30,8 @@ performance, keeping current with the Unicode Standard, and providing richer APIs. This module exposes the following functionality:

    +
  • Text Segmentation: Tokenizes text based on + properties and rules defined in Unicode.
  • Collation: Compare strings according to the conventions and standards of a particular language, region or country.
  • Normalization: Converts text to a unique,
@@ -42,6 +44,35 @@ APIs. This module exposes the following functionality:
  a context-sensitive fashion: e.g. mapping Traditional to Simplified Chinese

+

Text Segmentation

+

+Text Segmentation (Tokenization) divides document and query text into index terms +(typically words). Unicode provides special properties and rules so that this can +be done in a manner that works well with most languages. +

+

+ICUTokenizer implements the word segmentation specified in +Unicode Text Segmentation. +Additionally, the algorithm can be tailored based on writing system: for example, +text in the Thai script is automatically delegated to a dictionary-based segmentation +algorithm. +

+
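+
+As a rough illustration of how such a tailoring could be wired up, the sketch below supplies a
+custom compiled rule set (the Hebrew.brk resource added by this patch) for Hebrew-script text.
+The ICUTokenizerConfig / DefaultICUTokenizerConfig names, the getBreakIterator(int) override
+point, and the two-argument ICUTokenizer constructor are assumptions here rather than verified
+API; check the segmentation package javadocs before relying on them.
+
+  /*
+   * Hedged sketch: per-script tailoring. Compiled rules are loaded with
+   * RuleBasedBreakIterator.getInstanceFromCompiledRules (com.ibm.icu.text),
+   * the same mechanism the Lao break iterator tests in this patch use.
+   */
+  ICUTokenizerConfig config = new DefaultICUTokenizerConfig() {
+    @Override
+    public BreakIterator getBreakIterator(int script) {
+      if (script == UScript.HEBREW) {
+        try {
+          // Hebrew.brk lives next to the other compiled rules in the segmentation package
+          InputStream is = ICUTokenizer.class.getResourceAsStream("Hebrew.brk");
+          BreakIterator tailored = RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
+          is.close();
+          return tailored;
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+      return super.getBreakIterator(script);
+    }
+  };
+  Tokenizer tokenizer = new ICUTokenizer(reader, config);
+
+Tailored rule sets themselves are written as .rbbi source and compiled to the binary .brk form
+with the RBBIRuleCompiler tool included in this patch.
+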

Use Cases

+
    +
  • + As a more thorough replacement for StandardTokenizer that works well for + most languages. +
+

Example Usages

+

Tokenizing multilanguage text

+
+  /**
+   * This tokenizer will work well in general for most languages.
+   */
+  Tokenizer tokenizer = new ICUTokenizer(reader);
+
+
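+
+Inspecting the script of each token
+
+This patch also adds a ScriptAttribute (see the tokenattributes package above) that records the
+UTR #24 script of a token. A minimal sketch of reading it with the usual TokenStream consumption
+loop follows; CharTermAttribute is assumed to be available in this tree and is used here only to
+print the term text.
+
+  Tokenizer tokenizer = new ICUTokenizer(reader);
+  ScriptAttribute scriptAtt = tokenizer.addAttribute(ScriptAttribute.class);
+  CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
+  while (tokenizer.incrementToken()) {
+    // getName() is the UTR #24 full name; getShortName() and getCode() are also available
+    System.out.println(termAtt.toString() + " -> " + scriptAtt.getName());
+  }
+  tokenizer.end();
+  tokenizer.close();
+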

Collation

ICUCollationKeyFilter diff --git a/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk new file mode 100644 index 00000000000..25e0b18b111 Binary files /dev/null and b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Hebrew.brk differ diff --git a/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk new file mode 100644 index 00000000000..528c5bc4c42 Binary files /dev/null and b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Khmer.brk differ diff --git a/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk new file mode 100644 index 00000000000..4d3dc11ee72 Binary files /dev/null and b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Lao.brk differ diff --git a/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk new file mode 100644 index 00000000000..656304ee158 Binary files /dev/null and b/lucene/contrib/icu/src/resources/org/apache/lucene/analysis/icu/segmentation/Myanmar.brk differ diff --git a/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java b/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java new file mode 100644 index 00000000000..02b9d0a45f7 --- /dev/null +++ b/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestCharArrayIterator.java @@ -0,0 +1,109 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.text.CharacterIterator; + +import org.apache.lucene.util.LuceneTestCase; + +public class TestCharArrayIterator extends LuceneTestCase { + public void testBasicUsage() { + CharArrayIterator ci = new CharArrayIterator(); + ci.setText("testing".toCharArray(), 0, "testing".length()); + assertEquals(0, ci.getBeginIndex()); + assertEquals(7, ci.getEndIndex()); + assertEquals(0, ci.getIndex()); + assertEquals('t', ci.current()); + assertEquals('e', ci.next()); + assertEquals('g', ci.last()); + assertEquals('n', ci.previous()); + assertEquals('t', ci.first()); + assertEquals(CharacterIterator.DONE, ci.previous()); + } + + public void testFirst() { + CharArrayIterator ci = new CharArrayIterator(); + ci.setText("testing".toCharArray(), 0, "testing".length()); + ci.next(); + // Sets the position to getBeginIndex() and returns the character at that position. + assertEquals('t', ci.first()); + assertEquals(ci.getBeginIndex(), ci.getIndex()); + // or DONE if the text is empty + ci.setText(new char[] {}, 0, 0); + assertEquals(CharacterIterator.DONE, ci.first()); + } + + public void testLast() { + CharArrayIterator ci = new CharArrayIterator(); + ci.setText("testing".toCharArray(), 0, "testing".length()); + // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) + // and returns the character at that position. + assertEquals('g', ci.last()); + assertEquals(ci.getIndex(), ci.getEndIndex() - 1); + // or DONE if the text is empty + ci.setText(new char[] {}, 0, 0); + assertEquals(CharacterIterator.DONE, ci.last()); + assertEquals(ci.getEndIndex(), ci.getIndex()); + } + + public void testCurrent() { + CharArrayIterator ci = new CharArrayIterator(); + // Gets the character at the current position (as returned by getIndex()). + ci.setText("testing".toCharArray(), 0, "testing".length()); + assertEquals('t', ci.current()); + ci.last(); + ci.next(); + // or DONE if the current position is off the end of the text. + assertEquals(CharacterIterator.DONE, ci.current()); + } + + public void testNext() { + CharArrayIterator ci = new CharArrayIterator(); + ci.setText("te".toCharArray(), 0, 2); + // Increments the iterator's index by one and returns the character at the new index. + assertEquals('e', ci.next()); + assertEquals(1, ci.getIndex()); + // or DONE if the new position is off the end of the text range. 
+ assertEquals(CharacterIterator.DONE, ci.next()); + assertEquals(ci.getEndIndex(), ci.getIndex()); + } + + public void testSetIndex() { + CharArrayIterator ci = new CharArrayIterator(); + ci.setText("test".toCharArray(), 0, "test".length()); + try { + ci.setIndex(5); + fail(); + } catch (Exception e) { + assertTrue(e instanceof IllegalArgumentException); + } + } + + public void testClone() { + char text[] = "testing".toCharArray(); + CharArrayIterator ci = new CharArrayIterator(); + ci.setText(text, 0, text.length); + ci.next(); + CharArrayIterator ci2 = (CharArrayIterator) ci.clone(); + assertEquals(ci.getIndex(), ci2.getIndex()); + assertEquals(ci.next(), ci2.next()); + assertEquals(ci.last(), ci2.last()); + } + + +} diff --git a/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java new file mode 100644 index 00000000000..ca7b178984b --- /dev/null +++ b/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java @@ -0,0 +1,225 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.ReusableAnalyzerBase; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; + +import java.util.Arrays; + +public class TestICUTokenizer extends BaseTokenStreamTestCase { + + public void testHugeDoc() throws IOException { + StringBuilder sb = new StringBuilder(); + char whitespace[] = new char[4094]; + Arrays.fill(whitespace, ' '); + sb.append(whitespace); + sb.append("testing 1234"); + String input = sb.toString(); + ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input)); + assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" }); + } + + public void testHugeTerm2() throws IOException { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 40960; i++) { + sb.append('a'); + } + String input = sb.toString(); + ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input)); + char token[] = new char[4096]; + Arrays.fill(token, 'a'); + String expectedToken = new String(token); + String expected[] = { + expectedToken, expectedToken, expectedToken, + expectedToken, expectedToken, expectedToken, + expectedToken, expectedToken, expectedToken, + expectedToken + }; + assertTokenStreamContents(tokenizer, expected); + } + + private Analyzer a = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer tokenizer = new ICUTokenizer(reader); + TokenFilter filter = new ICUNormalizer2Filter(tokenizer); + return new TokenStreamComponents(tokenizer, filter); + } + }; + + public void testArmenian() throws Exception { + assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։", + new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", + "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } ); + } + + public void testAmharic() throws Exception { + assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም", + new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } ); + } + + public void testArabic() throws Exception { + assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.", + new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا", + "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); + } + + public void testAramaic() throws Exception { + assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀", + new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ", + "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"}); + } + + public void testBengali() throws Exception { + assertAnalyzesTo(a, "এই বিশ্বকোষ 
পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।", + new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার", + "শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" }); + } + + public void testFarsi() throws Exception { + assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.", + new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی", + "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" }); + } + + public void testGreek() throws Exception { + assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.", + new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που", + "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" }); + } + + public void testLao() throws Exception { + assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" }); + } + + public void testThai() throws Exception { + assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔", + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"}); + } + + public void testTibetan() throws Exception { + assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །", + new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" }); + } + + /* + * For chinese, tokenize as char (these can later form bigrams or whatever) + * TODO: why do full-width numerics have no word-break prop? + */ + public void testChinese() throws Exception { + assertAnalyzesTo(a, "我是中国人。 1234 Tests ", + new String[] { "我", "是", "中", "国", "人", "tests"}); + } + + public void testEmpty() throws Exception { + assertAnalyzesTo(a, "", new String[] {}); + assertAnalyzesTo(a, ".", new String[] {}); + assertAnalyzesTo(a, " ", new String[] {}); + } + + /* test various jira issues this analyzer is related to */ + + public void testLUCENE1545() throws Exception { + /* + * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTRE E. + * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost. + * Expected result is only on token "moͤchte". 
+ */ + assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" }); + } + + /* Tests from StandardAnalyzer, just to show behavior is similar */ + public void testAlphanumericSA() throws Exception { + // alphanumeric tokens + assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); + assertAnalyzesTo(a, "2B", new String[]{"2b"}); + } + + public void testDelimitersSA() throws Exception { + // other delimiters: "-", "/", "," + assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"}); + assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"}); + assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"}); + } + + public void testApostrophesSA() throws Exception { + // internal apostrophes: O'Reilly, you're, O'Reilly's + assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); + assertAnalyzesTo(a, "you're", new String[]{"you're"}); + assertAnalyzesTo(a, "she's", new String[]{"she's"}); + assertAnalyzesTo(a, "Jim's", new String[]{"jim's"}); + assertAnalyzesTo(a, "don't", new String[]{"don't"}); + assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"}); + } + + public void testNumericSA() throws Exception { + // floating point, serial, model numbers, ip addresses, etc. + // every other segment must have at least one digit + assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); + assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"}); + assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); + assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); + } + + public void testTextWithNumbersSA() throws Exception { + // numbers + assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"}); + } + + public void testVariousTextSA() throws Exception { + // various + assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"}); + assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"}); + assertAnalyzesTo(a, "foo bar . 
FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"}); + assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"}); + } + + public void testKoreanSA() throws Exception { + // Korean words + assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"}); + } + + public void testReusableTokenStream() throws Exception { + assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །", + new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", + "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" }); + } + + public void testOffsets() throws Exception { + assertAnalyzesTo(a, "David has 5000 bones", + new String[] {"david", "has", "5000", "bones"}, + new int[] {0, 6, 10, 15}, + new int[] {5, 9, 14, 20}); + } + + public void testTypes() throws Exception { + assertAnalyzesTo(a, "David has 5000 bones", + new String[] {"david", "has", "5000", "bones"}, + new String[] { "", "", "", "" }); + } +} diff --git a/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java b/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java new file mode 100644 index 00000000000..a46d0907491 --- /dev/null +++ b/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestLaoBreakIterator.java @@ -0,0 +1,90 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.InputStream; + +import org.apache.lucene.util.LuceneTestCase; + +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.text.UTF16; + +/** + * Tests LaoBreakIterator and its RBBI rules + */ +public class TestLaoBreakIterator extends LuceneTestCase { + private BreakIterator wordIterator; + + @Override + protected void setUp() throws Exception { + super.setUp(); + InputStream is = getClass().getResourceAsStream("Lao.brk"); + wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is)); + is.close(); + } + + private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) { + char text[] = sourceText.toCharArray(); + CharArrayIterator ci = new CharArrayIterator(); + ci.setText(text, 0, text.length); + iterator.setText(ci); + + for (int i = 0; i < tokens.length; i++) { + int start, end; + do { + start = iterator.current(); + end = iterator.next(); + } while (end != BreakIterator.DONE && !isWord(text, start, end)); + assertTrue(start != BreakIterator.DONE); + assertTrue(end != BreakIterator.DONE); + assertEquals(tokens[i], new String(text, start, end - start)); + } + + assertTrue(iterator.next() == BreakIterator.DONE); + } + + protected boolean isWord(char text[], int start, int end) { + int codepoint; + for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) { + codepoint = UTF16.charAt(text, 0, end, start); + + if (UCharacter.isLetterOrDigit(codepoint)) + return true; + } + + return false; + } + + public void testBasicUsage() throws Exception { + assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" }); + assertBreaksTo(wordIterator, "ຜູ້​ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" }); + assertBreaksTo(wordIterator, "", new String[] {}); + assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" }); + } + + public void testNumerics() throws Exception { + assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" }); + assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" }); + } + + public void testTextAndNumerics() throws Exception { + assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" }); + } +} diff --git a/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java b/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java new file mode 100644 index 00000000000..c788a58eed8 --- /dev/null +++ b/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/RBBIRuleCompiler.java @@ -0,0 +1,101 @@ +package org.apache.lucene.analysis.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.lang.reflect.Method; + +import com.ibm.icu.text.RuleBasedBreakIterator; + +/** + * Command-line utility to convert RuleBasedBreakIterator (.rbbi) files into + * binary compiled form (.brk). + */ +public class RBBIRuleCompiler { + + static String getRules(File ruleFile) throws IOException { + StringBuilder rules = new StringBuilder(); + InputStream in = new FileInputStream(ruleFile); + BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String line = null; + while ((line = cin.readLine()) != null) { + if (!line.startsWith("#")) + rules.append(line); + rules.append('\n'); + } + cin.close(); + in.close(); + return rules.toString(); + } + + static void compile(File srcDir, File destDir) throws Exception { + File files[] = srcDir.listFiles(new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.endsWith("rbbi"); + }}); + if (files == null) throw new IOException("Path does not exist: " + srcDir); + for (int i = 0; i < files.length; i++) { + File file = files[i]; + File outputFile = new File(destDir, + file.getName().replaceAll("rbbi$", "brk")); + String rules = getRules(file); + System.err.print("Compiling " + file.getName() + " to " + + outputFile.getName() + ": "); + /* + * if there is a syntax error, compileRules() may succeed. the way to + * check is to try to instantiate from the string. additionally if the + * rules are invalid, you can get a useful syntax error. + */ + try { + new RuleBasedBreakIterator(rules); + } catch (IllegalArgumentException e) { + /* + * do this intentionally, so you don't get a massive stack trace + * instead, get a useful syntax error! + */ + System.err.println(e.getMessage()); + System.exit(1); + } + FileOutputStream os = new FileOutputStream(outputFile); + // RBBIRuleBuilder.compileRules(rules, os); + Class builderClass = Class.forName("com.ibm.icu.text.RBBIRuleBuilder"); + Method method = builderClass.getDeclaredMethod("compileRules", String.class, OutputStream.class); + method.setAccessible(true); + method.invoke(null, rules, os); + os.close(); + System.err.println(outputFile.length() + " bytes."); + } + } + + public static void main(String args[]) throws Exception { + if (args.length < 2) { + System.err.println("Usage: RBBIRuleCompiler <sourcedir> <destdir>"); + System.exit(1); + } + compile(new File(args[0]), new File(args[1])); + System.exit(0); + } +}