mirror of https://github.com/apache/lucene.git
LUCENE-2414: Add ICUTokenizer, tailorable impl of Unicode Text Segmentation
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940447 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
98c47c57e0
commit
5abbf3429c
|
@ -140,6 +140,11 @@ New features
|
|||
* LUCENE-2298: Add analyzers/stempel, an algorithmic stemmer with support for
|
||||
the Polish language. (Andrzej Bialecki via Robert Muir)
|
||||
|
||||
* LUCENE-2414: Add ICUTokenizer, a tailorable tokenizer that implements Unicode
|
||||
Text Segmentation. This tokenizer is useful for documents or collections with
|
||||
multiple languages. The default configuration includes special support for
|
||||
Thai, Lao, Myanmar, and Khmer. (Robert Muir, Uwe Schindler)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
|
||||
|
|
|
@ -43,7 +43,39 @@
|
|||
<!-- Runs the external "gennorm2" executable; failonerror="true" makes the build fail if it exits with an error. -->
<!-- NOTE(review): both a combined <arg line> and the individual <arg value> elements appear below. As written the options would be passed twice; presumably one of the two forms is the pre-change version. Confirm which should remain. -->
<target name="gennorm2">
|
||||
<echo>Warning: only works on a big-endian platform!</echo>
|
||||
<exec executable="gennorm2" failonerror="true">
|
||||
<arg line="-v -s ${gennorm2.src.dir} ${gennorm2.src.files} -o ${gennorm2.dst}"/>
|
||||
<arg value="-v"/>
|
||||
<arg value="-s"/>
|
||||
<arg value="${gennorm2.src.dir}"/>
|
||||
<arg value="${gennorm2.src.files}"/>
|
||||
<arg value="-o"/>
|
||||
<arg value="${gennorm2.dst}"/>
|
||||
</exec>
|
||||
</target>
|
||||
|
||||
<property name="rbbi.src.dir" location="src/data/uax29"/>
|
||||
<property name="rbbi.dst.dir" location="src/resources/org/apache/lucene/analysis/icu/segmentation"/>
|
||||
|
||||
<!-- Creates ${rbbi.dst.dir}, then forks a JVM running org.apache.lucene.analysis.icu.RBBIRuleCompiler with the rule source directory and the destination directory as its two arguments. -->
<!-- Depends on compile-tools, which builds the tool classes placed on the classpath below. -->
<target name="genrbbi" depends="compile-tools">
|
||||
<mkdir dir="${rbbi.dst.dir}"/>
|
||||
<java
|
||||
classname="org.apache.lucene.analysis.icu.RBBIRuleCompiler"
|
||||
dir="."
|
||||
fork="true"
|
||||
failonerror="true">
|
||||
<classpath>
|
||||
<path refid="additional.dependencies"/>
|
||||
<pathelement location="${build.dir}/classes/tools"/>
|
||||
</classpath>
|
||||
<arg value="${rbbi.src.dir}"/>
|
||||
<arg value="${rbbi.dst.dir}"/>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
<!-- Compiles the sources under src/tools/java into ${build.dir}/classes/tools, using the "classpath" reference. -->
<!-- The "compile" element is not defined in this fragment; presumably a macro or task declared elsewhere in the build. -->
<target name="compile-tools">
|
||||
<compile
|
||||
srcdir="src/tools/java"
|
||||
destdir="${build.dir}/classes/tools">
|
||||
<classpath refid="classpath"/>
|
||||
</compile>
|
||||
</target>
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#
|
||||
# This is an example of rule tailoring for Hebrew.
|
||||
# In this example the single-quote is added to the Extend category
|
||||
# The double-quote is added to the MidLetter category.
|
||||
#
|
||||
!!chain;
|
||||
$CR = [\p{Word_Break = CR}];
|
||||
$LF = [\p{Word_Break = LF}];
|
||||
$Newline = [\p{Word_Break = Newline}];
|
||||
$Extend = [\p{Word_Break = Extend}\u0027];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}\u0022];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
|
||||
|
||||
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidLetterEx = $MidLetter ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
!!forward;
|
||||
|
||||
$CR $LF;
|
||||
[^$CR $LF $Newline]? ($Extend | $Format)+;
|
||||
$NumericEx {100};
|
||||
$ALetterEx {200};
|
||||
$ALetterEx $ALetterEx {200};
|
||||
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
|
||||
$NumericEx $NumericEx {100};
|
||||
$ALetterEx $NumericEx {200};
|
||||
$NumericEx $ALetterEx {200};
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
|
||||
$ALetterEx $ExtendNumLetEx {200};
|
||||
$NumericEx $ExtendNumLetEx {100};
|
||||
$ExtendNumLetEx $ExtendNumLetEx {200};
|
||||
$ExtendNumLetEx $ALetterEx {200};
|
||||
$ExtendNumLetEx $NumericEx {100};
|
|
@ -0,0 +1,61 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#
|
||||
# Parses Khmer text, with orthographic syllable as token.
|
||||
#
|
||||
# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
|
||||
#
|
||||
# B = base character (consonant, independent vowel, etc)
|
||||
$KhmerBase = [\u1780-\u17B3];
|
||||
# R = robat
|
||||
$KhmerRobat = [\u17CC];
|
||||
# C = consonant shifter
|
||||
$KhmerShifter = [\u17C9\u17CA];
|
||||
# S = subscript consonant or independent vowel sign
|
||||
$KhmerSub = ([\u17D2] $KhmerBase);
|
||||
# V = dependent vowel sign
|
||||
$KhmerVowel = [\u17B4-\u17C5];
|
||||
# Z = zero-width joiner or non-joiner
|
||||
$KhmerZWC = [\u200C\u200D];
|
||||
# O = any other sign
|
||||
$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
|
||||
|
||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
||||
|
||||
$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
|
||||
|
||||
$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
|
||||
|
||||
#
|
||||
# default numerical definitions
|
||||
#
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
!!forward;
|
||||
$KhmerJoinedSyllableEx {200};
|
||||
|
||||
# default numeric rules
|
||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
@ -0,0 +1,192 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Parses Lao text, with syllable as token.
|
||||
#
|
||||
# The definition of Lao syllable is based on:
|
||||
#
|
||||
# Syllabification of Lao Script for Line Breaking
|
||||
# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
|
||||
# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
|
||||
# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
|
||||
# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
|
||||
#
|
||||
# NOTE:
|
||||
# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
|
||||
# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
|
||||
#
|
||||
# Syllable structure, where X is the nuclear consonant:
|
||||
#
|
||||
# +----+
|
||||
# | X5 |
|
||||
# +----+
|
||||
# | X4 |
|
||||
# +----+----+----+----+----+----+----+-----+
|
||||
# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
|
||||
# +----+----+----+----+----+----+----+-----+
|
||||
# | X2 |
|
||||
# +----+
|
||||
# | X3 |
|
||||
# +----+
|
||||
#
|
||||
# X0 represents a vowel which occurs before the nuclear consonant.
|
||||
# It can always define the beginning of syllable.
|
||||
$X0 = [\u0EC0-\u0EC4];
|
||||
# X1 is a combination consonant which comes before the nuclear consonant,
|
||||
# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
|
||||
$X1 = [\u0EAB];
|
||||
# X represents the nuclear consonant.
|
||||
$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
|
||||
# X2 is a combination consonant which comes after the nuclear consonant,
|
||||
# which is placed under or next to the nuclear consonant.
|
||||
$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
|
||||
# X3 represents a vowel which occurs under the nuclear consonant.
|
||||
$X3 = [\u0EB8\u0EB9];
|
||||
# X4 represents a vowel which occurs above the nuclear consonant.
|
||||
$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
|
||||
# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
|
||||
$X5 = [\u0EC8-\u0ECB];
|
||||
# X6 represents a consonant vowel, which occurs after the nuclear consonant.
|
||||
# It functions when the syllable doesn’t have any vowels. And it always exists with X8.
|
||||
$X6 = [\u0EA7\u0EAD\u0EBD];
|
||||
# X7 represents a final vowel.
|
||||
# However X7_1 always represents the end of syllable and it never exists with tone mark.
|
||||
$X7 = [\u0EB0\u0EB2\u0EB3];
|
||||
# X8 represents an alternate consonant.
|
||||
$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
|
||||
# X9 represents alternate consonants used to pronounce foreign terms; it always exists with X10_3.
|
||||
$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
|
||||
# X10 represents a sign mark.
|
||||
# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
|
||||
$X10 = [\u0EAF\u0EC6\u0ECC];
|
||||
|
||||
# Section 1
|
||||
$X0_1 = [\u0EC0];
|
||||
$X4_1_2 = [\u0EB4\u0EB5];
|
||||
$X4_3_4 = [\u0EB6\u0EB7];
|
||||
$X4_6 = [\u0EBB];
|
||||
$X4_7 = [\u0EB1];
|
||||
$X6_2 = [\u0EAD];
|
||||
$X6_3 = [\u0EBD];
|
||||
$X7_1 = [\u0EB0];
|
||||
$X7_2 = [\u0EB2];
|
||||
$X10_1 = [\u0EAF];
|
||||
$X10_2 = [\u0EC6];
|
||||
$X10_3 = [\u0ECC];
|
||||
|
||||
$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
|
||||
$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
|
||||
|
||||
# Section 2
|
||||
$X0_2 = [\u0EC1];
|
||||
|
||||
$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
|
||||
$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
|
||||
|
||||
# Section 3
|
||||
$X0_3 = [\u0EC2];
|
||||
$X8_3 = [\u0E8D];
|
||||
$X8_8 = [\u0EA7];
|
||||
|
||||
$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
|
||||
$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
|
||||
|
||||
$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
|
||||
|
||||
# Section 4
|
||||
$X0_4 = [\u0EC4];
|
||||
$X6_1 = [\u0EA7];
|
||||
|
||||
$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 5
|
||||
$X0_5 = [\u0EC3];
|
||||
|
||||
$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 6
|
||||
$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 7
|
||||
$X4_1_4 = [\u0EB4-\u0EB7];
|
||||
|
||||
$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 8
|
||||
$X4_5 = [\u0ECD];
|
||||
|
||||
$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 9
|
||||
|
||||
$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
|
||||
|
||||
$Rule9 = ($Rule9_1 | $Rule9_2);
|
||||
|
||||
# Section 10
|
||||
$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 11
|
||||
$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 12
|
||||
$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
|
||||
|
||||
# Section 13
|
||||
$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
# Section 14
|
||||
$X7_3 = [\u0EB3];
|
||||
|
||||
$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
|
||||
|
||||
$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
|
||||
|
||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
||||
|
||||
$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
|
||||
|
||||
#
|
||||
# default numerical definitions
|
||||
#
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
!!forward;
|
||||
|
||||
$LaoJoinedSyllableEx {200};
|
||||
# default numeric rules
|
||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
@ -0,0 +1,50 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
#
|
||||
# Parses Myanmar text, with syllable as token.
|
||||
#
|
||||
|
||||
$Cons = [[:Other_Letter:]&[:Myanmar:]];
|
||||
$Virama = [\u1039];
|
||||
$Asat = [\u103A];
|
||||
|
||||
$WordJoin = [:Line_Break=Word_Joiner:];
|
||||
|
||||
#
|
||||
# default numerical definitions
|
||||
#
|
||||
$Extend = [\p{Word_Break = Extend}];
|
||||
$Format = [\p{Word_Break = Format}];
|
||||
$MidNumLet = [\p{Word_Break = MidNumLet}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
|
||||
$MidNumEx = $MidNum ($Extend | $Format)*;
|
||||
$NumericEx = $Numeric ($Extend | $Format)*;
|
||||
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
|
||||
|
||||
$ConsEx = $Cons ($Extend | $Format)*;
|
||||
$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
|
||||
$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
|
||||
$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
|
||||
|
||||
!!forward;
|
||||
$MyanmarJoinedSyllableEx {200};
|
||||
|
||||
# default numeric rules
|
||||
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
|
|
@ -0,0 +1,171 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.DictionaryBasedBreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* Contain all the issues surrounding BreakIterators in ICU in one place.
|
||||
* Basically this boils down to the fact that they aren't very friendly to any
|
||||
* sort of OO design.
|
||||
* <p>
|
||||
* http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
|
||||
* BreakIterator from RuleBasedBreakIterator
|
||||
* <p>
|
||||
* DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
|
||||
* doesn't actually behave as a subclass: it always returns 0 for
|
||||
* getRuleStatus():
|
||||
* http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
|
||||
* tags
|
||||
* @lucene.experimental
|
||||
*/
|
||||
abstract class BreakIteratorWrapper {
|
||||
protected final CharArrayIterator textIterator = new CharArrayIterator();
|
||||
protected char text[];
|
||||
protected int start;
|
||||
protected int length;
|
||||
|
||||
abstract int next();
|
||||
abstract int current();
|
||||
abstract int getRuleStatus();
|
||||
abstract void setText(CharacterIterator text);
|
||||
|
||||
void setText(char text[], int start, int length) {
|
||||
this.text = text;
|
||||
this.start = start;
|
||||
this.length = length;
|
||||
textIterator.setText(text, start, length);
|
||||
setText(textIterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
|
||||
* treat it like a generic BreakIterator If its any other
|
||||
* RuleBasedBreakIterator, the rule status can be used for token type. If its
|
||||
* any other BreakIterator, the rulestatus method is not available, so treat
|
||||
* it like a generic BreakIterator.
|
||||
*/
|
||||
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
|
||||
if (breakIterator instanceof RuleBasedBreakIterator
|
||||
&& !(breakIterator instanceof DictionaryBasedBreakIterator))
|
||||
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
|
||||
else
|
||||
return new BIWrapper(breakIterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as its not
|
||||
* a DictionaryBasedBreakIterator) behaves correctly.
|
||||
*/
|
||||
static final class RBBIWrapper extends BreakIteratorWrapper {
|
||||
private final RuleBasedBreakIterator rbbi;
|
||||
|
||||
RBBIWrapper(RuleBasedBreakIterator rbbi) {
|
||||
this.rbbi = rbbi;
|
||||
}
|
||||
|
||||
@Override
|
||||
int current() {
|
||||
return rbbi.current();
|
||||
}
|
||||
|
||||
@Override
|
||||
int getRuleStatus() {
|
||||
return rbbi.getRuleStatus();
|
||||
}
|
||||
|
||||
@Override
|
||||
int next() {
|
||||
return rbbi.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
void setText(CharacterIterator text) {
|
||||
rbbi.setText(text);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic BreakIterator wrapper: Either the rulestatus method is not
|
||||
* available or always returns 0. Calculate a rulestatus here so it behaves
|
||||
* like RuleBasedBreakIterator.
|
||||
*
|
||||
* Note: This is slower than RuleBasedBreakIterator.
|
||||
*/
|
||||
static final class BIWrapper extends BreakIteratorWrapper {
|
||||
private final BreakIterator bi;
|
||||
private int status;
|
||||
|
||||
BIWrapper(BreakIterator bi) {
|
||||
this.bi = bi;
|
||||
}
|
||||
|
||||
@Override
|
||||
int current() {
|
||||
return bi.current();
|
||||
}
|
||||
|
||||
@Override
|
||||
int getRuleStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
@Override
|
||||
int next() {
|
||||
int current = bi.current();
|
||||
int next = bi.next();
|
||||
status = calcStatus(current, next);
|
||||
return next;
|
||||
}
|
||||
|
||||
private int calcStatus(int current, int next) {
|
||||
if (current == BreakIterator.DONE || next == BreakIterator.DONE)
|
||||
return RuleBasedBreakIterator.WORD_NONE;
|
||||
|
||||
int begin = start + current;
|
||||
int end = start + next;
|
||||
|
||||
int codepoint;
|
||||
for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
|
||||
codepoint = UTF16.charAt(text, 0, end, begin);
|
||||
|
||||
if (UCharacter.isDigit(codepoint))
|
||||
return RuleBasedBreakIterator.WORD_NUMBER;
|
||||
else if (UCharacter.isLetter(codepoint)) {
|
||||
// TODO: try to separately specify ideographic, kana?
|
||||
// [currently all bundled as letter for this case]
|
||||
return RuleBasedBreakIterator.WORD_LETTER;
|
||||
}
|
||||
}
|
||||
|
||||
return RuleBasedBreakIterator.WORD_NONE;
|
||||
}
|
||||
|
||||
@Override
|
||||
void setText(CharacterIterator text) {
|
||||
bi.setText(text);
|
||||
status = RuleBasedBreakIterator.WORD_NONE;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,118 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
/**
|
||||
* Wraps a char[] as CharacterIterator for processing with a BreakIterator
|
||||
* @lucene.experimental
|
||||
*/
|
||||
final class CharArrayIterator implements CharacterIterator {
  // backing buffer and the window [start, limit) being iterated
  private char array[];
  private int start;
  private int index;
  private int length;
  private int limit;

  /** @return the backing buffer last passed to setText */
  public char [] getText() {
    return array;
  }

  /** @return offset of the window inside the backing buffer */
  public int getStart() {
    return start;
  }

  /** @return number of chars in the window */
  public int getLength() {
    return length;
  }

  /**
   * Point this iterator at a new window of a buffer and rewind to its
   * first char.
   *
   * @param array text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  void setText(final char array[], int start, int length) {
    this.array = array;
    this.start = start;
    this.length = length;
    this.limit = start + length;
    this.index = start;
  }

  /** Char at the current position, or DONE when positioned at the end. */
  public char current() {
    if (index == limit) {
      return DONE;
    }
    return array[index];
  }

  /** Rewind to the start of the window. */
  public char first() {
    index = start;
    return current();
  }

  /** Positions are reported relative to the window, so this is always 0. */
  public int getBeginIndex() {
    return 0;
  }

  /** End position relative to the window, i.e. its length. */
  public int getEndIndex() {
    return length;
  }

  /** Current position relative to the window start. */
  public int getIndex() {
    return index - start;
  }

  /** Move to the last char of the window (or to the end if it is empty). */
  public char last() {
    index = limit;
    if (limit != start) {
      index--;
    }
    return current();
  }

  /** Step forward; stops at the end of the window and returns DONE there. */
  public char next() {
    index++;
    if (index < limit) {
      return current();
    }
    index = limit;
    return DONE;
  }

  /** Step backward; stays on the first position and returns DONE when already there. */
  public char previous() {
    index--;
    if (index >= start) {
      return current();
    }
    index = start;
    return DONE;
  }

  /**
   * Jump to a window-relative position.
   *
   * @throws IllegalArgumentException if position lies outside
   *         [getBeginIndex(), getEndIndex()]
   */
  public char setIndex(int position) {
    final boolean inRange = position >= getBeginIndex() && position <= getEndIndex();
    if (!inRange) {
      throw new IllegalArgumentException("Illegal Position: " + position);
    }
    index = start + position;
    return current();
  }

  /** Independent iterator over the same buffer and window, at the same position. */
  @Override
  public Object clone() {
    final CharArrayIterator copy = new CharArrayIterator();
    copy.setText(array, start, length);
    copy.index = index;
    return copy;
  }
}
|
|
@ -0,0 +1,126 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
|
||||
/**
|
||||
* An internal BreakIterator for multilingual text, following recommendations
|
||||
* from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
|
||||
* <p>
|
||||
* See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
|
||||
* design.
|
||||
* <p>
|
||||
* Text is first divided into script boundaries. The processing is then
|
||||
* delegated to the appropriate break iterator for that specific script.
|
||||
* <p>
|
||||
* This break iterator also allows you to retrieve the ISO 15924 script code
|
||||
* associated with a piece of text.
|
||||
* <p>
|
||||
* See also UAX #29, UTR #24
|
||||
* @lucene.experimental
|
||||
*/
|
||||
final class CompositeBreakIterator {
|
||||
private final ICUTokenizerConfig config;
|
||||
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
|
||||
|
||||
private BreakIteratorWrapper rbbi;
|
||||
private final ScriptIterator scriptIterator = new ScriptIterator();
|
||||
|
||||
private char text[];
|
||||
|
||||
CompositeBreakIterator(ICUTokenizerConfig config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the next break position. If the RBBI range is exhausted within the
|
||||
* script boundary, examine the next script boundary.
|
||||
*
|
||||
* @return the next break position or BreakIterator.DONE
|
||||
*/
|
||||
int next() {
|
||||
int next = rbbi.next();
|
||||
while (next == BreakIterator.DONE && scriptIterator.next()) {
|
||||
rbbi = getBreakIterator(scriptIterator.getScriptCode());
|
||||
rbbi.setText(text, scriptIterator.getScriptStart(),
|
||||
scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
|
||||
next = rbbi.next();
|
||||
}
|
||||
return (next == BreakIterator.DONE) ? BreakIterator.DONE : next
|
||||
+ scriptIterator.getScriptStart();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the current break position.
|
||||
*
|
||||
* @return the current break position or BreakIterator.DONE
|
||||
*/
|
||||
int current() {
|
||||
final int current = rbbi.current();
|
||||
return (current == BreakIterator.DONE) ? BreakIterator.DONE : current
|
||||
+ scriptIterator.getScriptStart();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the rule status code (token type) from the underlying break
|
||||
* iterator
|
||||
*
|
||||
* @return rule status code (see RuleBasedBreakIterator constants)
|
||||
*/
|
||||
int getRuleStatus() {
|
||||
return rbbi.getRuleStatus();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the UScript script code for the current token. This code can be
|
||||
* decoded with UScript into a name or ISO 15924 code.
|
||||
*
|
||||
* @return UScript script code for the current token.
|
||||
*/
|
||||
int getScriptCode() {
|
||||
return scriptIterator.getScriptCode();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a new region of text to be examined by this iterator
|
||||
*
|
||||
* @param text buffer of text
|
||||
* @param start offset into buffer
|
||||
* @param length maximum length to examine
|
||||
*/
|
||||
void setText(final char text[], int start, int length) {
|
||||
this.text = text;
|
||||
scriptIterator.setText(text, start, length);
|
||||
if (scriptIterator.next()) {
|
||||
rbbi = getBreakIterator(scriptIterator.getScriptCode());
|
||||
rbbi.setText(text, scriptIterator.getScriptStart(),
|
||||
scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
|
||||
} else {
|
||||
rbbi = getBreakIterator(UScript.COMMON);
|
||||
rbbi.setText(text, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
private BreakIteratorWrapper getBreakIterator(int scriptCode) {
|
||||
if (wordBreakers[scriptCode] == null)
|
||||
wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
|
||||
return wordBreakers[scriptCode];
|
||||
}
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
* Default {@link ICUTokenizerConfig} that is generally applicable
|
||||
* to many languages.
|
||||
* <p>
|
||||
* Generally tokenizes Unicode text according to UAX#29
|
||||
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
|
||||
* but with the following tailorings:
|
||||
* <ul>
|
||||
* <li>Thai text is broken into words with a
|
||||
* {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
|
||||
* <li>Lao, Myanmar, and Khmer text is broken into syllables
|
||||
* based on custom BreakIterator rules.
|
||||
* <li>Hebrew text has custom tailorings to handle special cases
|
||||
* involving punctuation.
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
||||
/** Token type for words containing ideographic characters */
|
||||
public static final String WORD_IDEO = "<IDEO>";
|
||||
/** Token type for words containing Japanese kana */
|
||||
public static final String WORD_KANA = "<KANA>";
|
||||
/** Token type for words that contain letters */
|
||||
public static final String WORD_LETTER = "<WORD>";
|
||||
/** Token type for words that appear to be numbers */
|
||||
public static final String WORD_NUMBER = "<NUM>";
|
||||
|
||||
/*
|
||||
* the default breakiterators in use. these can be expensive to
|
||||
* instantiate, cheap to clone.
|
||||
*/
|
||||
private static final BreakIterator rootBreakIterator =
|
||||
BreakIterator.getWordInstance(ULocale.ROOT);
|
||||
private static final BreakIterator thaiBreakIterator =
|
||||
BreakIterator.getWordInstance(new ULocale("th_TH"));
|
||||
private static final BreakIterator hebrewBreakIterator =
|
||||
readBreakIterator("Hebrew.brk");
|
||||
private static final BreakIterator khmerBreakIterator =
|
||||
readBreakIterator("Khmer.brk");
|
||||
private static final BreakIterator laoBreakIterator =
|
||||
new LaoBreakIterator(readBreakIterator("Lao.brk"));
|
||||
private static final BreakIterator myanmarBreakIterator =
|
||||
readBreakIterator("Myanmar.brk");
|
||||
|
||||
@Override
|
||||
public BreakIterator getBreakIterator(int script) {
|
||||
switch(script) {
|
||||
case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
|
||||
case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
|
||||
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
|
||||
case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
|
||||
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
|
||||
default: return (BreakIterator)rootBreakIterator.clone();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getType(int script, int ruleStatus) {
|
||||
switch (ruleStatus) {
|
||||
case RuleBasedBreakIterator.WORD_IDEO:
|
||||
return WORD_IDEO;
|
||||
case RuleBasedBreakIterator.WORD_KANA:
|
||||
return WORD_KANA;
|
||||
case RuleBasedBreakIterator.WORD_LETTER:
|
||||
return WORD_LETTER;
|
||||
case RuleBasedBreakIterator.WORD_NUMBER:
|
||||
return WORD_NUMBER;
|
||||
default: /* some other custom code */
|
||||
return "<OTHER>";
|
||||
}
|
||||
}
|
||||
|
||||
private static RuleBasedBreakIterator readBreakIterator(String filename) {
|
||||
InputStream is =
|
||||
DefaultICUTokenizerConfig.class.getResourceAsStream(filename);
|
||||
try {
|
||||
RuleBasedBreakIterator bi =
|
||||
RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
|
||||
is.close();
|
||||
return bi;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,196 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
|
||||
/**
|
||||
* Breaks text into words according to UAX #29: Unicode Text Segmentation
|
||||
* (http://www.unicode.org/reports/tr29/)
|
||||
* <p>
|
||||
* Words are broken across script boundaries, then segmented according to
|
||||
* the BreakIterator and typing provided by the {@link ICUTokenizerConfig}
|
||||
* </p>
|
||||
* @see ICUTokenizerConfig
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public final class ICUTokenizer extends Tokenizer {
|
||||
private static final int IOBUFFER = 4096;
|
||||
private final char buffer[] = new char[IOBUFFER];
|
||||
/** true length of text in the buffer */
|
||||
private int length = 0;
|
||||
/** length in buffer that can be evaluated safely, up to a safe end point */
|
||||
private int usableLength = 0;
|
||||
/** accumulated offset of previous buffers for this reader, for offsetAtt */
|
||||
private int offset = 0;
|
||||
|
||||
private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */
|
||||
private final ICUTokenizerConfig config;
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class);
|
||||
|
||||
/**
|
||||
* Construct a new ICUTokenizer that breaks text into words from the given
|
||||
* Reader.
|
||||
* <p>
|
||||
* The default script-specific handling is used.
|
||||
*
|
||||
* @param input Reader containing text to tokenize.
|
||||
* @see DefaultICUTokenizerConfig
|
||||
*/
|
||||
public ICUTokenizer(Reader input) {
|
||||
this(input, new DefaultICUTokenizerConfig());
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new ICUTokenizer that breaks text into words from the given
|
||||
* Reader, using a tailored BreakIterator configuration.
|
||||
*
|
||||
* @param input Reader containing text to tokenize.
|
||||
* @param config Tailored BreakIterator configuration
|
||||
*/
|
||||
public ICUTokenizer(Reader input, ICUTokenizerConfig config) {
|
||||
super(input);
|
||||
this.config = config;
|
||||
breaker = new CompositeBreakIterator(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
clearAttributes();
|
||||
if (length == 0)
|
||||
refill();
|
||||
while (!incrementTokenBuffer()) {
|
||||
refill();
|
||||
if (length <= 0) // no more bytes to read;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
breaker.setText(buffer, 0, 0);
|
||||
length = usableLength = offset = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void end() throws IOException {
|
||||
final int finalOffset = (length < 0) ? offset : offset + length;
|
||||
offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
/*
|
||||
* This tokenizes text based upon the longest matching rule, and because of
|
||||
* this, isn't friendly to a Reader.
|
||||
*
|
||||
* Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
|
||||
* text, the last unambiguous break point is found (in this implementation:
|
||||
* white space character) Any remaining characters represent possible partial
|
||||
* words, so are appended to the front of the next chunk.
|
||||
*
|
||||
* There is the possibility that there are no unambiguous break points within
|
||||
* an entire 4kB chunk of text (binary data). So there is a maximum word limit
|
||||
* of 4kB since it will not try to grow the buffer in this case.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Returns the last unambiguous break position in the text.
|
||||
*
|
||||
* @return position of character, or -1 if one does not exist
|
||||
*/
|
||||
private int findSafeEnd() {
|
||||
for (int i = length - 1; i >= 0; i--)
|
||||
if (UCharacter.isWhitespace(buffer[i]))
|
||||
return i + 1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Refill the buffer, accumulating the offset and setting usableLength to the
|
||||
* last unambiguous break position
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
private void refill() throws IOException {
|
||||
offset += usableLength;
|
||||
int leftover = length - usableLength;
|
||||
System.arraycopy(buffer, usableLength, buffer, 0, leftover);
|
||||
int requested = buffer.length - leftover;
|
||||
int returned = input.read(buffer, leftover, requested);
|
||||
length = returned < 0 ? leftover : returned + leftover;
|
||||
if (returned < requested) /* reader has been emptied, process the rest */
|
||||
usableLength = length;
|
||||
else { /* still more data to be read, find a safe-stopping place */
|
||||
usableLength = findSafeEnd();
|
||||
if (usableLength < 0)
|
||||
usableLength = length; /*
|
||||
* more than IOBUFFER of text without space,
|
||||
* gonna possibly truncate tokens
|
||||
*/
|
||||
}
|
||||
|
||||
breaker.setText(buffer, 0, Math.max(0, usableLength));
|
||||
}
|
||||
|
||||
/*
|
||||
* return true if there is a token from the buffer, or null if it is
|
||||
* exhausted.
|
||||
*/
|
||||
private boolean incrementTokenBuffer() {
|
||||
int start = breaker.current();
|
||||
if (start == BreakIterator.DONE)
|
||||
return false; // BreakIterator exhausted
|
||||
|
||||
// find the next set of boundaries, skipping over non-tokens (rule status 0)
|
||||
int end = breaker.next();
|
||||
while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
|
||||
start = end;
|
||||
end = breaker.next();
|
||||
}
|
||||
|
||||
if (start == BreakIterator.DONE)
|
||||
return false; // BreakIterator exhausted
|
||||
|
||||
termAtt.copyBuffer(buffer, start, end - start);
|
||||
offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
|
||||
typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus()));
|
||||
scriptAtt.setCode(breaker.getScriptCode());
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
|
||||
/**
|
||||
* Class that allows for tailored Unicode Text Segmentation on
|
||||
* a per-writing system basis.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class ICUTokenizerConfig {
|
||||
/** Return a breakiterator capable of processing a given script. */
|
||||
public abstract BreakIterator getBreakIterator(int script);
|
||||
/** Return a token type value for a given script and BreakIterator
|
||||
* rule status. */
|
||||
public abstract String getType(int script, int ruleStatus);
|
||||
}
|
|
@ -0,0 +1,226 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Syllable iterator for Lao text.
|
||||
* <p>
|
||||
* This breaks Lao text into syllables according to:
|
||||
* <i>Syllabification of Lao Script for Line Breaking</i>
|
||||
* Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
|
||||
* Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
|
||||
* <ul>
|
||||
* <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
|
||||
* <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
|
||||
* </ul>
|
||||
* <p>
|
||||
* Most work is accomplished with RBBI rules, however some additional special logic is needed
|
||||
* that cannot be coded in a grammar, and this is implemented here.
|
||||
* <p>
|
||||
* For example, what appears to be a final consonant might instead be part of the next syllable.
|
||||
* Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
|
||||
* <p>
|
||||
* Take for instance the text ກວ່າດອກ
|
||||
* The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
|
||||
* What LaoBreakIterator does, according to the paper:
|
||||
* <ol>
|
||||
* <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
|
||||
* <li>verify the modified previous syllable (ກວ່າ ) is still legal.
|
||||
* <li>verify the modified current syllable (ດອກ) is now legal.
|
||||
* <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
|
||||
* </ol>
|
||||
* <p>
|
||||
* Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
|
||||
* This is the issue of combining marks being in the wrong order (typos).
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class LaoBreakIterator extends BreakIterator {
|
||||
RuleBasedBreakIterator rules;
|
||||
CharArrayIterator text;
|
||||
|
||||
CharArrayIterator working = new CharArrayIterator();
|
||||
int workingOffset = 0;
|
||||
|
||||
CharArrayIterator verifyText = new CharArrayIterator();
|
||||
RuleBasedBreakIterator verify;
|
||||
|
||||
private static final UnicodeSet laoSet;
|
||||
static {
|
||||
laoSet = new UnicodeSet("[:Lao:]");
|
||||
laoSet.compact();
|
||||
laoSet.freeze();
|
||||
}
|
||||
|
||||
public LaoBreakIterator(RuleBasedBreakIterator rules) {
|
||||
this.rules = (RuleBasedBreakIterator) rules.clone();
|
||||
this.verify = (RuleBasedBreakIterator) rules.clone();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int current() {
|
||||
int current = rules.current();
|
||||
return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int first() {
|
||||
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
|
||||
rules.setText(working);
|
||||
workingOffset = 0;
|
||||
int first = rules.first();
|
||||
return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int following(int offset) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharacterIterator getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int last() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next() {
|
||||
int current = current();
|
||||
int next = rules.next();
|
||||
if (next == BreakIterator.DONE)
|
||||
return next;
|
||||
else
|
||||
next += workingOffset;
|
||||
|
||||
char c = working.current();
|
||||
int following = rules.next(); // lookahead
|
||||
if (following != BreakIterator.DONE) {
|
||||
following += workingOffset;
|
||||
if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
|
||||
workingOffset = next - 1;
|
||||
working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
|
||||
return next - 1;
|
||||
}
|
||||
rules.previous(); // undo the lookahead
|
||||
}
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int next(int n) {
|
||||
if (n < 0)
|
||||
throw new UnsupportedOperationException("Backwards traversal is unsupported");
|
||||
|
||||
int result = current();
|
||||
while (n > 0) {
|
||||
result = next();
|
||||
--n;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int previous() {
|
||||
throw new UnsupportedOperationException("Backwards traversal is unsupported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(CharacterIterator text) {
|
||||
if (!(text instanceof CharArrayIterator))
|
||||
throw new UnsupportedOperationException("unsupported CharacterIterator");
|
||||
this.text = (CharArrayIterator) text;
|
||||
ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
|
||||
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
|
||||
rules.setText(working);
|
||||
workingOffset = 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setText(String newText) {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText(newText.toCharArray(), 0, newText.length());
|
||||
setText(ci);
|
||||
}
|
||||
|
||||
private boolean verifyPushBack(int current, int next) {
|
||||
int shortenedSyllable = next - current - 1;
|
||||
|
||||
verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
|
||||
verify.setText(verifyText);
|
||||
if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
|
||||
return false;
|
||||
|
||||
|
||||
verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
|
||||
verify.setText(verifyText);
|
||||
|
||||
return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
|
||||
}
|
||||
|
||||
// TODO: only bubblesort around runs of combining marks, instead of the entire text.
|
||||
private void ccReorder(char[] text, int start, int length) {
|
||||
boolean reordered;
|
||||
do {
|
||||
int prevCC = 0;
|
||||
reordered = false;
|
||||
for (int i = start; i < start + length; i++) {
|
||||
final char c = text[i];
|
||||
final int cc = UCharacter.getCombiningClass(c);
|
||||
if (cc > 0 && cc < prevCC) {
|
||||
// swap
|
||||
text[i] = text[i - 1];
|
||||
text[i - 1] = c;
|
||||
reordered = true;
|
||||
} else {
|
||||
prevCC = cc;
|
||||
}
|
||||
}
|
||||
|
||||
} while (reordered == true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Clone method. Creates another LaoBreakIterator with the same behavior
|
||||
* and current state as this one.
|
||||
* @return The clone.
|
||||
*/
|
||||
@Override
|
||||
public Object clone() {
|
||||
LaoBreakIterator other = (LaoBreakIterator) super.clone();
|
||||
other.rules = (RuleBasedBreakIterator) rules.clone();
|
||||
other.verify = (RuleBasedBreakIterator) verify.clone();
|
||||
if (text != null)
|
||||
other.text = (CharArrayIterator) text.clone();
|
||||
if (working != null)
|
||||
other.working = (CharArrayIterator) working.clone();
|
||||
if (verifyText != null)
|
||||
other.verifyText = (CharArrayIterator) verifyText.clone();
|
||||
return other;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,170 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
|
||||
* Software, and to permit persons to whom the Software is furnished to do so,
|
||||
* provided that the above copyright notice(s) and this permission notice appear
|
||||
* in all copies of the Software and that both the above copyright notice(s) and
|
||||
* this permission notice appear in supporting documentation.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
|
||||
* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
|
||||
* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
|
||||
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
||||
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*
|
||||
* Except as contained in this notice, the name of a copyright holder shall not
|
||||
* be used in advertising or otherwise to promote the sale, use or other
|
||||
* dealings in this Software without prior written authorization of the
|
||||
* copyright holder.
|
||||
*/
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* An iterator that locates ISO 15924 script boundaries in text.
|
||||
* <p>
|
||||
* This is not the same as simply looking at the Unicode block, or even the
|
||||
* Script property. Some characters are 'common' across multiple scripts, and
|
||||
* some 'inherit' the script value of text surrounding them.
|
||||
* <p>
|
||||
* This is similar to ICU (internal-only) UScriptRun, with the following
|
||||
* differences:
|
||||
* <ul>
|
||||
* <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
|
||||
* is not necessary. It's also quite expensive.
|
||||
* <li>Non-spacing marks inherit the script of their base character, following
|
||||
* recommendations from UTR #24.
|
||||
* </ul>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
final class ScriptIterator {
|
||||
private char text[];
|
||||
private int start;
|
||||
private int limit;
|
||||
private int index;
|
||||
|
||||
private int scriptStart;
|
||||
private int scriptLimit;
|
||||
private int scriptCode;
|
||||
|
||||
/**
|
||||
* Get the start of this script run
|
||||
*
|
||||
* @return start position of script run
|
||||
*/
|
||||
int getScriptStart() {
|
||||
return scriptStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the index of the first character after the end of this script run
|
||||
*
|
||||
* @return position of the first character after this script run
|
||||
*/
|
||||
int getScriptLimit() {
|
||||
return scriptLimit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the UScript script code for this script run
|
||||
*
|
||||
* @return code for the script of the current run
|
||||
*/
|
||||
int getScriptCode() {
|
||||
return scriptCode;
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterates to the next script run, returning true if one exists.
|
||||
*
|
||||
* @return true if there is another script run, false otherwise.
|
||||
*/
|
||||
boolean next() {
|
||||
if (scriptLimit >= limit)
|
||||
return false;
|
||||
|
||||
scriptCode = UScript.COMMON;
|
||||
scriptStart = scriptLimit;
|
||||
|
||||
while (index < limit) {
|
||||
final int ch = UTF16.charAt(text, start, limit, index - start);
|
||||
final int sc = getScript(ch);
|
||||
|
||||
/*
|
||||
* From UTR #24: Implementations that determine the boundaries between
|
||||
* characters of given scripts should never break between a non-spacing
|
||||
* mark and its base character. Thus for boundary determinations and
|
||||
* similar sorts of processing, a non-spacing mark — whatever its script
|
||||
* value — should inherit the script value of its base character.
|
||||
*/
|
||||
if (isSameScript(scriptCode, sc)
|
||||
|| UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) {
|
||||
index += UTF16.getCharCount(ch);
|
||||
|
||||
/*
|
||||
* Inherited or Common becomes the script code of the surrounding text.
|
||||
*/
|
||||
if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
|
||||
scriptCode = sc;
|
||||
}
|
||||
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
scriptLimit = index;
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Determine if two scripts are compatible. */
|
||||
private static boolean isSameScript(int scriptOne, int scriptTwo) {
|
||||
return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED
|
||||
|| scriptOne == scriptTwo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a new region of text to be examined by this iterator
|
||||
*
|
||||
* @param text text buffer to examine
|
||||
* @param start offset into buffer
|
||||
* @param length maximum length to examine
|
||||
*/
|
||||
void setText(char text[], int start, int length) {
|
||||
this.text = text;
|
||||
this.start = start;
|
||||
this.index = start;
|
||||
this.limit = start + length;
|
||||
this.scriptStart = start;
|
||||
this.scriptLimit = start;
|
||||
this.scriptCode = UScript.INVALID_CODE;
|
||||
}
|
||||
|
||||
/** linear fast-path for basic latin case */
|
||||
private static final int basicLatin[] = new int[128];
|
||||
|
||||
static {
|
||||
for (int i = 0; i < basicLatin.length; i++)
|
||||
basicLatin[i] = UScript.getScript(i);
|
||||
}
|
||||
|
||||
/** fast version of UScript.getScript(). Basic Latin is an array lookup */
|
||||
private static int getScript(int codepoint) {
|
||||
if (0 <= codepoint && codepoint < basicLatin.length)
|
||||
return basicLatin[codepoint];
|
||||
else
|
||||
return UScript.getScript(codepoint);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html>
|
||||
<body>
|
||||
Tokenizer that breaks text into words with the Unicode Text Segmentation algorithm.
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.lucene.analysis.icu.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.Attribute;
|
||||
|
||||
import com.ibm.icu.lang.UScript; // javadoc @link
|
||||
|
||||
/**
|
||||
* This attribute stores the UTR #24 script value for a token of text.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public interface ScriptAttribute extends Attribute {
|
||||
/**
|
||||
* Get the numeric code for this script value.
|
||||
* This is the constant value from {@link UScript}.
|
||||
* @return numeric code
|
||||
*/
|
||||
public int getCode();
|
||||
/**
|
||||
* Set the numeric code for this script value.
|
||||
* This is the constant value from {@link UScript}.
|
||||
* @param code numeric code
|
||||
*/
|
||||
public void setCode(int code);
|
||||
/**
|
||||
* Get the full name.
|
||||
* @return UTR #24 full name.
|
||||
*/
|
||||
public String getName();
|
||||
/**
|
||||
* Get the abbreviated name.
|
||||
* @return UTR #24 abbreviated name.
|
||||
*/
|
||||
public String getShortName();
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
package org.apache.lucene.analysis.icu.tokenattributes;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.lucene.util.AttributeImpl;
|
||||
|
||||
import com.ibm.icu.lang.UScript;
|
||||
|
||||
/**
|
||||
* Implementation of {@link ScriptAttribute} that stores the script
|
||||
* as an integer.
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribute, Cloneable, Serializable {
|
||||
private int code = UScript.COMMON;
|
||||
|
||||
public int getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(int code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return UScript.getName(code);
|
||||
}
|
||||
|
||||
public String getShortName() {
|
||||
return UScript.getShortName(code);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() {
|
||||
code = UScript.COMMON;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void copyTo(AttributeImpl target) {
|
||||
ScriptAttribute t = (ScriptAttribute) target;
|
||||
t.setCode(code);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (other instanceof ScriptAttributeImpl) {
|
||||
return ((ScriptAttributeImpl) other).code == code;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "script=" + getName();
|
||||
}
|
||||
}
|
|
@ -30,6 +30,8 @@ performance, keeping current with the Unicode Standard, and providing richer
|
|||
APIs. This module exposes the following functionality:
|
||||
</p>
|
||||
<ul>
|
||||
<li><a href="#segmentation">Text Segmentation</a>: Tokenizes text based on
|
||||
properties and rules defined in Unicode.</li>
|
||||
<li><a href="#collation">Collation</a>: Compare strings according to the
|
||||
conventions and standards of a particular language, region or country.</li>
|
||||
<li><a href="#normalization">Normalization</a>: Converts text to a unique,
|
||||
|
@ -42,6 +44,35 @@ APIs. This module exposes the following functionality:
|
|||
a context-sensitive fashion: e.g. mapping Traditional to Simplified Chinese</li>
|
||||
</ul>
|
||||
<hr/>
|
||||
<h1><a name="segmentation">Text Segmentation</a></h1>
|
||||
<p>
|
||||
Text Segmentation (Tokenization) divides document and query text into index terms
|
||||
(typically words). Unicode provides special properties and rules so that this can
|
||||
be done in a manner that works well with most languages.
|
||||
</p>
|
||||
<p>
|
||||
Text Segmentation implements the word segmentation specified in
|
||||
<a href="http://unicode.org/reports/tr29/">Unicode Text Segmentation</a>.
|
||||
Additionally, the algorithm can be tailored based on writing system; for example,
|
||||
text in the Thai script is automatically delegated to a dictionary-based segmentation
|
||||
algorithm.
|
||||
</p>
|
||||
<h2>Use Cases</h2>
|
||||
<ul>
|
||||
<li>
|
||||
As a more thorough replacement for StandardTokenizer that works well for
|
||||
most languages.
|
||||
</li>
|
||||
</ul>
|
||||
<h2>Example Usages</h2>
|
||||
<h3>Tokenizing multilanguage text</h3>
|
||||
<code><pre>
|
||||
/**
|
||||
* This tokenizer will work well in general for most languages.
|
||||
*/
|
||||
Tokenizer tokenizer = new ICUTokenizer(reader);
|
||||
</pre></code>
|
||||
<hr/>
|
||||
<h1><a name="collation">Collation</a></h1>
|
||||
<p>
|
||||
<code>ICUCollationKeyFilter</code>
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,109 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestCharArrayIterator extends LuceneTestCase {
|
||||
public void testBasicUsage() {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||
assertEquals(0, ci.getBeginIndex());
|
||||
assertEquals(7, ci.getEndIndex());
|
||||
assertEquals(0, ci.getIndex());
|
||||
assertEquals('t', ci.current());
|
||||
assertEquals('e', ci.next());
|
||||
assertEquals('g', ci.last());
|
||||
assertEquals('n', ci.previous());
|
||||
assertEquals('t', ci.first());
|
||||
assertEquals(CharacterIterator.DONE, ci.previous());
|
||||
}
|
||||
|
||||
public void testFirst() {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||
ci.next();
|
||||
// Sets the position to getBeginIndex() and returns the character at that position.
|
||||
assertEquals('t', ci.first());
|
||||
assertEquals(ci.getBeginIndex(), ci.getIndex());
|
||||
// or DONE if the text is empty
|
||||
ci.setText(new char[] {}, 0, 0);
|
||||
assertEquals(CharacterIterator.DONE, ci.first());
|
||||
}
|
||||
|
||||
public void testLast() {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||
// Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty)
|
||||
// and returns the character at that position.
|
||||
assertEquals('g', ci.last());
|
||||
assertEquals(ci.getIndex(), ci.getEndIndex() - 1);
|
||||
// or DONE if the text is empty
|
||||
ci.setText(new char[] {}, 0, 0);
|
||||
assertEquals(CharacterIterator.DONE, ci.last());
|
||||
assertEquals(ci.getEndIndex(), ci.getIndex());
|
||||
}
|
||||
|
||||
public void testCurrent() {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
// Gets the character at the current position (as returned by getIndex()).
|
||||
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||
assertEquals('t', ci.current());
|
||||
ci.last();
|
||||
ci.next();
|
||||
// or DONE if the current position is off the end of the text.
|
||||
assertEquals(CharacterIterator.DONE, ci.current());
|
||||
}
|
||||
|
||||
public void testNext() {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText("te".toCharArray(), 0, 2);
|
||||
// Increments the iterator's index by one and returns the character at the new index.
|
||||
assertEquals('e', ci.next());
|
||||
assertEquals(1, ci.getIndex());
|
||||
// or DONE if the new position is off the end of the text range.
|
||||
assertEquals(CharacterIterator.DONE, ci.next());
|
||||
assertEquals(ci.getEndIndex(), ci.getIndex());
|
||||
}
|
||||
|
||||
public void testSetIndex() {
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText("test".toCharArray(), 0, "test".length());
|
||||
try {
|
||||
ci.setIndex(5);
|
||||
fail();
|
||||
} catch (Exception e) {
|
||||
assertTrue(e instanceof IllegalArgumentException);
|
||||
}
|
||||
}
|
||||
|
||||
public void testClone() {
|
||||
char text[] = "testing".toCharArray();
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText(text, 0, text.length);
|
||||
ci.next();
|
||||
CharArrayIterator ci2 = (CharArrayIterator) ci.clone();
|
||||
assertEquals(ci.getIndex(), ci2.getIndex());
|
||||
assertEquals(ci.next(), ci2.next());
|
||||
assertEquals(ci.last(), ci2.last());
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,225 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.ReusableAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestICUTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testHugeDoc() throws IOException {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
char whitespace[] = new char[4094];
|
||||
Arrays.fill(whitespace, ' ');
|
||||
sb.append(whitespace);
|
||||
sb.append("testing 1234");
|
||||
String input = sb.toString();
|
||||
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
|
||||
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
|
||||
}
|
||||
|
||||
public void testHugeTerm2() throws IOException {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < 40960; i++) {
|
||||
sb.append('a');
|
||||
}
|
||||
String input = sb.toString();
|
||||
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
|
||||
char token[] = new char[4096];
|
||||
Arrays.fill(token, 'a');
|
||||
String expectedToken = new String(token);
|
||||
String expected[] = {
|
||||
expectedToken, expectedToken, expectedToken,
|
||||
expectedToken, expectedToken, expectedToken,
|
||||
expectedToken, expectedToken, expectedToken,
|
||||
expectedToken
|
||||
};
|
||||
assertTokenStreamContents(tokenizer, expected);
|
||||
}
|
||||
|
||||
private Analyzer a = new ReusableAnalyzerBase() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer tokenizer = new ICUTokenizer(reader);
|
||||
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
|
||||
return new TokenStreamComponents(tokenizer, filter);
|
||||
}
|
||||
};
|
||||
|
||||
public void testArmenian() throws Exception {
|
||||
assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
|
||||
new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
|
||||
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
|
||||
}
|
||||
|
||||
public void testAmharic() throws Exception {
|
||||
assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
|
||||
new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
|
||||
}
|
||||
|
||||
public void testArabic() throws Exception {
|
||||
assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
|
||||
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
|
||||
"بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } );
|
||||
}
|
||||
|
||||
public void testAramaic() throws Exception {
|
||||
assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
|
||||
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
|
||||
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
|
||||
}
|
||||
|
||||
public void testBengali() throws Exception {
|
||||
assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
|
||||
new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
|
||||
"শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
|
||||
}
|
||||
|
||||
public void testFarsi() throws Exception {
|
||||
assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
|
||||
new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
|
||||
"برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
|
||||
}
|
||||
|
||||
public void testGreek() throws Exception {
|
||||
assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
|
||||
new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
|
||||
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
|
||||
}
|
||||
|
||||
public void testLao() throws Exception {
|
||||
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
|
||||
}
|
||||
|
||||
public void testThai() throws Exception {
|
||||
assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
|
||||
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"});
|
||||
}
|
||||
|
||||
public void testTibetan() throws Exception {
|
||||
assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
|
||||
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
|
||||
}
|
||||
|
||||
/*
|
||||
* For chinese, tokenize as char (these can later form bigrams or whatever)
|
||||
* TODO: why do full-width numerics have no word-break prop?
|
||||
*/
|
||||
public void testChinese() throws Exception {
|
||||
assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
|
||||
new String[] { "我", "是", "中", "国", "人", "tests"});
|
||||
}
|
||||
|
||||
public void testEmpty() throws Exception {
|
||||
assertAnalyzesTo(a, "", new String[] {});
|
||||
assertAnalyzesTo(a, ".", new String[] {});
|
||||
assertAnalyzesTo(a, " ", new String[] {});
|
||||
}
|
||||
|
||||
/* test various jira issues this analyzer is related to */
|
||||
|
||||
public void testLUCENE1545() throws Exception {
|
||||
/*
|
||||
* Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTRE E.
|
||||
* The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
|
||||
* Expected result is only on token "moͤchte".
|
||||
*/
|
||||
assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
|
||||
}
|
||||
|
||||
/* Tests from StandardAnalyzer, just to show behavior is similar */
|
||||
public void testAlphanumericSA() throws Exception {
|
||||
// alphanumeric tokens
|
||||
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
|
||||
assertAnalyzesTo(a, "2B", new String[]{"2b"});
|
||||
}
|
||||
|
||||
public void testDelimitersSA() throws Exception {
|
||||
// other delimiters: "-", "/", ","
|
||||
assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
|
||||
assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
|
||||
assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
|
||||
}
|
||||
|
||||
public void testApostrophesSA() throws Exception {
|
||||
// internal apostrophes: O'Reilly, you're, O'Reilly's
|
||||
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
|
||||
assertAnalyzesTo(a, "you're", new String[]{"you're"});
|
||||
assertAnalyzesTo(a, "she's", new String[]{"she's"});
|
||||
assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
|
||||
assertAnalyzesTo(a, "don't", new String[]{"don't"});
|
||||
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
|
||||
}
|
||||
|
||||
public void testNumericSA() throws Exception {
|
||||
// floating point, serial, model numbers, ip addresses, etc.
|
||||
// every other segment must have at least one digit
|
||||
assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
|
||||
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
|
||||
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
|
||||
}
|
||||
|
||||
public void testTextWithNumbersSA() throws Exception {
|
||||
// numbers
|
||||
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
|
||||
}
|
||||
|
||||
public void testVariousTextSA() throws Exception {
|
||||
// various
|
||||
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
|
||||
}
|
||||
|
||||
public void testKoreanSA() throws Exception {
|
||||
// Korean words
|
||||
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
|
||||
}
|
||||
|
||||
public void testReusableTokenStream() throws Exception {
|
||||
assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
|
||||
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང",
|
||||
"འཕེལ", "དུ", "གཏོང", "བར", "ཧ", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
|
||||
}
|
||||
|
||||
public void testOffsets() throws Exception {
|
||||
assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"david", "has", "5000", "bones"},
|
||||
new int[] {0, 6, 10, 15},
|
||||
new int[] {5, 9, 14, 20});
|
||||
}
|
||||
|
||||
public void testTypes() throws Exception {
|
||||
assertAnalyzesTo(a, "David has 5000 bones",
|
||||
new String[] {"david", "has", "5000", "bones"},
|
||||
new String[] { "<WORD>", "<WORD>", "<NUM>", "<WORD>" });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* Tests LaoBreakIterator and its RBBI rules
|
||||
*/
|
||||
public class TestLaoBreakIterator extends LuceneTestCase {
|
||||
private BreakIterator wordIterator;
|
||||
|
||||
@Override
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
InputStream is = getClass().getResourceAsStream("Lao.brk");
|
||||
wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
|
||||
is.close();
|
||||
}
|
||||
|
||||
private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
|
||||
char text[] = sourceText.toCharArray();
|
||||
CharArrayIterator ci = new CharArrayIterator();
|
||||
ci.setText(text, 0, text.length);
|
||||
iterator.setText(ci);
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
int start, end;
|
||||
do {
|
||||
start = iterator.current();
|
||||
end = iterator.next();
|
||||
} while (end != BreakIterator.DONE && !isWord(text, start, end));
|
||||
assertTrue(start != BreakIterator.DONE);
|
||||
assertTrue(end != BreakIterator.DONE);
|
||||
assertEquals(tokens[i], new String(text, start, end - start));
|
||||
}
|
||||
|
||||
assertTrue(iterator.next() == BreakIterator.DONE);
|
||||
}
|
||||
|
||||
protected boolean isWord(char text[], int start, int end) {
|
||||
int codepoint;
|
||||
for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
|
||||
codepoint = UTF16.charAt(text, 0, end, start);
|
||||
|
||||
if (UCharacter.isLetterOrDigit(codepoint))
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public void testBasicUsage() throws Exception {
|
||||
assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
|
||||
assertBreaksTo(wordIterator, "ຜູ້ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
|
||||
assertBreaksTo(wordIterator, "", new String[] {});
|
||||
assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
|
||||
}
|
||||
|
||||
public void testNumerics() throws Exception {
|
||||
assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
|
||||
assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
|
||||
}
|
||||
|
||||
public void testTextAndNumerics() throws Exception {
|
||||
assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
package org.apache.lucene.analysis.icu;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FilenameFilter;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
|
||||
/**
|
||||
* Command-line utility to converts RuleBasedBreakIterator (.rbbi) files into
|
||||
* binary compiled form (.brk).
|
||||
*/
|
||||
public class RBBIRuleCompiler {
|
||||
|
||||
static String getRules(File ruleFile) throws IOException {
|
||||
StringBuilder rules = new StringBuilder();
|
||||
InputStream in = new FileInputStream(ruleFile);
|
||||
BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8"));
|
||||
String line = null;
|
||||
while ((line = cin.readLine()) != null) {
|
||||
if (!line.startsWith("#"))
|
||||
rules.append(line);
|
||||
rules.append('\n');
|
||||
}
|
||||
cin.close();
|
||||
in.close();
|
||||
return rules.toString();
|
||||
}
|
||||
|
||||
static void compile(File srcDir, File destDir) throws Exception {
|
||||
File files[] = srcDir.listFiles(new FilenameFilter() {
|
||||
public boolean accept(File dir, String name) {
|
||||
return name.endsWith("rbbi");
|
||||
}});
|
||||
if (files == null) throw new IOException("Path does not exist: " + srcDir);
|
||||
for (int i = 0; i < files.length; i++) {
|
||||
File file = files[i];
|
||||
File outputFile = new File(destDir,
|
||||
file.getName().replaceAll("rbbi$", "brk"));
|
||||
String rules = getRules(file);
|
||||
System.err.print("Compiling " + file.getName() + " to "
|
||||
+ outputFile.getName() + ": ");
|
||||
/*
|
||||
* if there is a syntax error, compileRules() may succeed. the way to
|
||||
* check is to try to instantiate from the string. additionally if the
|
||||
* rules are invalid, you can get a useful syntax error.
|
||||
*/
|
||||
try {
|
||||
new RuleBasedBreakIterator(rules);
|
||||
} catch (IllegalArgumentException e) {
|
||||
/*
|
||||
* do this intentionally, so you don't get a massive stack trace
|
||||
* instead, get a useful syntax error!
|
||||
*/
|
||||
System.err.println(e.getMessage());
|
||||
System.exit(1);
|
||||
}
|
||||
FileOutputStream os = new FileOutputStream(outputFile);
|
||||
// RBBIRuleBuilder.compileRules(rules, os);
|
||||
Class<?> builderClass = Class.forName("com.ibm.icu.text.RBBIRuleBuilder");
|
||||
Method method = builderClass.getDeclaredMethod("compileRules", String.class, OutputStream.class);
|
||||
method.setAccessible(true);
|
||||
method.invoke(null, rules, os);
|
||||
os.close();
|
||||
System.err.println(outputFile.length() + " bytes.");
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
if (args.length < 2) {
|
||||
System.err.println("Usage: RBBIRuleComputer <sourcedir> <destdir>");
|
||||
System.exit(1);
|
||||
}
|
||||
compile(new File(args[0]), new File(args[1]));
|
||||
System.exit(0);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue