LUCENE-2414: Add ICUTokenizer, tailorable impl of Unicode Text Segmentation

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940447 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-03 13:20:09 +00:00
parent 98c47c57e0
commit 5abbf3429c
26 changed files with 2266 additions and 1 deletions

View File

@ -140,6 +140,11 @@ New features
* LUCENE-2298: Add analyzers/stempel, an algorithmic stemmer with support for
the Polish language. (Andrzej Bialecki via Robert Muir)
* LUCENE-2414: Add ICUTokenizer, a tailorable tokenizer that implements Unicode
Text Segmentation. This tokenizer is useful for documents or collections with
multiple languages. The default configuration includes special support for
Thai, Lao, Myanmar, and Khmer. (Robert Muir, Uwe Schindler)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

View File

@ -43,7 +43,39 @@
<target name="gennorm2">
  <echo>Warning: only works on a big-endian platform!</echo>
  <!-- Each token is passed as its own <arg value=.../> so that paths
       containing spaces are not split by the shell. -->
  <exec executable="gennorm2" failonerror="true">
    <arg value="-v"/>
    <arg value="-s"/>
    <arg value="${gennorm2.src.dir}"/>
    <arg value="${gennorm2.src.files}"/>
    <arg value="-o"/>
    <arg value="${gennorm2.dst}"/>
  </exec>
</target>
<!-- RBBI rule sources (text) and the destination for compiled binary .brk files
     that are loaded as classpath resources by the ICU tokenizer at runtime. -->
<property name="rbbi.src.dir" location="src/data/uax29"/>
<property name="rbbi.dst.dir" location="src/resources/org/apache/lucene/analysis/icu/segmentation"/>
<!-- Compiles every RBBI rule file under ${rbbi.src.dir} into a binary break-rule
     file in ${rbbi.dst.dir}, using the RBBIRuleCompiler tool built below. -->
<target name="genrbbi" depends="compile-tools">
  <mkdir dir="${rbbi.dst.dir}"/>
  <java
    classname="org.apache.lucene.analysis.icu.RBBIRuleCompiler"
    dir="."
    fork="true"
    failonerror="true">
    <classpath>
      <path refid="additional.dependencies"/>
      <pathelement location="${build.dir}/classes/tools"/>
    </classpath>
    <arg value="${rbbi.src.dir}"/>
    <arg value="${rbbi.dst.dir}"/>
  </java>
</target>
<!-- Builds the build-time tools (e.g. RBBIRuleCompiler); not shipped with the jar. -->
<target name="compile-tools">
  <compile
    srcdir="src/tools/java"
    destdir="${build.dir}/classes/tools">
    <classpath refid="classpath"/>
  </compile>
</target>
</project>

View File

@ -0,0 +1,61 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This is an example of rule tailoring for Hebrew.
# In this example the single-quote is added to the Extend category
# The double-quote is added to the MidLetter category.
#
!!chain;
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}\u0027];
$Format = [\p{Word_Break = Format}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}\u0022];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$CR $LF;
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$ALetterEx $ALetterEx {200};
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
$NumericEx $NumericEx {100};
$ALetterEx $NumericEx {200};
$NumericEx $ALetterEx {200};
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$ALetterEx $ExtendNumLetEx {200};
$NumericEx $ExtendNumLetEx {100};
$ExtendNumLetEx $ExtendNumLetEx {200};
$ExtendNumLetEx $ALetterEx {200};
$ExtendNumLetEx $NumericEx {100};

View File

@ -0,0 +1,61 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Parses Khmer text, with orthographic syllable as token.
#
# The definition of Khmer orthographic syllable is taken from the Unicode Standard.
#
# B = base character (consonant, independent vowel, etc)
$KhmerBase = [\u1780-\u17B3];
# R = robat
$KhmerRobat = [\u17CC];
# C = consonant shifter
$KhmerShifter = [\u17C9\u17CA];
# S = subscript consonant or independent vowel sign
$KhmerSub = ([\u17D2] $KhmerBase);
# V = dependent vowel sign
$KhmerVowel = [\u17B4-\u17C5];
# Z = zero-width joiner or non-joiner
$KhmerZWC = [\u200C\u200D];
# O = any other sign
$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
$WordJoin = [:Line_Break=Word_Joiner:];
$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$KhmerJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -0,0 +1,192 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Parses Lao text, with syllable as token.
#
# The definition of Lao syllable is based on:
#
# Syllabification of Lao Script for Line Breaking
# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
#
# NOTE:
# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
#
# Syllable structure, where X is the nuclear consonant:
#
# +----+
# | X5 |
# +----+
# | X4 |
# +----+----+----+----+----+----+----+-----+
# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
# +----+----+----+----+----+----+----+-----+
# | X2 |
# +----+
# | X3 |
# +----+
#
# X0 represents a vowel which occurs before the nuclear consonant.
# It can always define the beginning of syllable.
$X0 = [\u0EC0-\u0EC4];
# X1 is a combination consonant which comes before the nuclear consonant,
# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
$X1 = [\u0EAB];
# X represents the nuclear consonant.
$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
# X2 is a combination consonant which comes after the nuclear consonant,
# which is placed under or next to the nuclear consonant.
$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
# X3 represents a vowel which occurs under the nuclear consonant.
$X3 = [\u0EB8\u0EB9];
# X4 represents a vowel which occurs above the nuclear consonant.
$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
$X5 = [\u0EC8-\u0ECB];
# X6 represents a consonant vowel, which occurs after the nuclear consonant.
# It functions when the syllable doesn't have any vowels, and it always exists with X8.
$X6 = [\u0EA7\u0EAD\u0EBD];
# X7 represents a final vowel.
# However X7_1 always represents the end of syllable and it never exists with tone mark.
$X7 = [\u0EB0\u0EB2\u0EB3];
# X8 represents an alternate consonant.
$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
# X9 represents alternate consonants to pronounce foreign terms; it always exists with X10_3.
$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
# X10 represents a sign mark.
# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
$X10 = [\u0EAF\u0EC6\u0ECC];
# Section 1
$X0_1 = [\u0EC0];
$X4_1_2 = [\u0EB4\u0EB5];
$X4_3_4 = [\u0EB6\u0EB7];
$X4_6 = [\u0EBB];
$X4_7 = [\u0EB1];
$X6_2 = [\u0EAD];
$X6_3 = [\u0EBD];
$X7_1 = [\u0EB0];
$X7_2 = [\u0EB2];
$X10_1 = [\u0EAF];
$X10_2 = [\u0EC6];
$X10_3 = [\u0ECC];
$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
# Section 2
$X0_2 = [\u0EC1];
$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
# Section 3
$X0_3 = [\u0EC2];
$X8_3 = [\u0E8D];
$X8_8 = [\u0EA7];
$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
# Section 4
$X0_4 = [\u0EC4];
$X6_1 = [\u0EA7];
$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 5
$X0_5 = [\u0EC3];
$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 6
$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 7
$X4_1_4 = [\u0EB4-\u0EB7];
$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 8
$X4_5 = [\u0ECD];
$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 9
$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
$Rule9 = ($Rule9_1 | $Rule9_2);
# Section 10
$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 11
$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 12
$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
# Section 13
$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 14
$X7_3 = [\u0EB3];
$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
$WordJoin = [:Line_Break=Word_Joiner:];
$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$LaoJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -0,0 +1,50 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Parses Myanmar text, with syllable as token.
#
$Cons = [[:Other_Letter:]&[:Myanmar:]];
$Virama = [\u1039];
$Asat = [\u103A];
$WordJoin = [:Line_Break=Word_Joiner:];
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$ConsEx = $Cons ($Extend | $Format)*;
$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
!!forward;
$MyanmarJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -0,0 +1,171 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
/**
* Contain all the issues surrounding BreakIterators in ICU in one place.
* Basically this boils down to the fact that they aren't very friendly to any
* sort of OO design.
* <p>
* http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
* BreakIterator from RuleBasedBreakIterator
* <p>
* DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
* doesn't actually behave as a subclass: it always returns 0 for
* getRuleStatus():
* http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
* tags
* @lucene.experimental
*/
abstract class BreakIteratorWrapper {
  protected final CharArrayIterator textIterator = new CharArrayIterator();
  protected char text[];
  protected int start;
  protected int length;

  /** Returns the next boundary, or BreakIterator.DONE when exhausted. */
  abstract int next();

  /** Returns the current boundary position. */
  abstract int current();

  /** Returns the rule status (token type) for the most recent boundary. */
  abstract int getRuleStatus();

  /** Sets the text to iterate over via a CharacterIterator. */
  abstract void setText(CharacterIterator text);

  /**
   * Sets a new region of text for iteration.
   *
   * @param text   buffer containing the text
   * @param start  offset into the buffer
   * @param length number of chars to examine
   */
  void setText(char text[], int start, int length) {
    this.text = text;
    this.start = start;
    this.length = length;
    textIterator.setText(text, start, length);
    setText(textIterator);
  }

  /**
   * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
   * treat it like a generic BreakIterator If its any other
   * RuleBasedBreakIterator, the rule status can be used for token type. If its
   * any other BreakIterator, the rulestatus method is not available, so treat
   * it like a generic BreakIterator.
   */
  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
    if (breakIterator instanceof RuleBasedBreakIterator
        && !(breakIterator instanceof DictionaryBasedBreakIterator))
      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
    else
      return new BIWrapper(breakIterator);
  }

  /**
   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as its not
   * a DictionaryBasedBreakIterator) behaves correctly.
   */
  static final class RBBIWrapper extends BreakIteratorWrapper {
    private final RuleBasedBreakIterator rbbi;

    RBBIWrapper(RuleBasedBreakIterator rbbi) {
      this.rbbi = rbbi;
    }

    @Override
    int current() {
      return rbbi.current();
    }

    @Override
    int getRuleStatus() {
      return rbbi.getRuleStatus();
    }

    @Override
    int next() {
      return rbbi.next();
    }

    @Override
    void setText(CharacterIterator text) {
      rbbi.setText(text);
    }
  }

  /**
   * Generic BreakIterator wrapper: Either the rulestatus method is not
   * available or always returns 0. Calculate a rulestatus here so it behaves
   * like RuleBasedBreakIterator.
   *
   * Note: This is slower than RuleBasedBreakIterator.
   */
  static final class BIWrapper extends BreakIteratorWrapper {
    private final BreakIterator bi;
    private int status;

    BIWrapper(BreakIterator bi) {
      this.bi = bi;
    }

    @Override
    int current() {
      return bi.current();
    }

    @Override
    int getRuleStatus() {
      return status;
    }

    @Override
    int next() {
      int current = bi.current();
      int next = bi.next();
      // compute a synthetic rule status for the token we just passed over
      status = calcStatus(current, next);
      return next;
    }

    /**
     * Classifies the span [current, next) relative to {@link #start}: the
     * first digit codepoint makes it WORD_NUMBER, the first letter makes it
     * WORD_LETTER, otherwise WORD_NONE.
     */
    private int calcStatus(int current, int next) {
      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
        return RuleBasedBreakIterator.WORD_NONE;

      int begin = start + current;
      int end = start + next;

      int codepoint;
      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
        // Bug fix: was UTF16.charAt(text, 0, end, begin) — the loop advanced
        // i but always re-examined the FIRST codepoint, misclassifying tokens
        // whose leading codepoint is neither a digit nor a letter.
        codepoint = UTF16.charAt(text, 0, end, i);

        if (UCharacter.isDigit(codepoint))
          return RuleBasedBreakIterator.WORD_NUMBER;
        else if (UCharacter.isLetter(codepoint)) {
          // TODO: try to separately specify ideographic, kana?
          // [currently all bundled as letter for this case]
          return RuleBasedBreakIterator.WORD_LETTER;
        }
      }

      return RuleBasedBreakIterator.WORD_NONE;
    }

    @Override
    void setText(CharacterIterator text) {
      bi.setText(text);
      status = RuleBasedBreakIterator.WORD_NONE;
    }
  }
}

View File

@ -0,0 +1,118 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
/**
* Wraps a char[] as CharacterIterator for processing with a BreakIterator
* @lucene.experimental
*/
/**
 * Presents a slice of a char[] buffer as a {@link CharacterIterator}, so a
 * BreakIterator can walk tokenizer buffers without any copying.
 * <p>
 * Positions reported to callers (getIndex/setIndex/getBeginIndex/getEndIndex)
 * are relative to the slice, i.e. in the range [0, length].
 * @lucene.experimental
 */
final class CharArrayIterator implements CharacterIterator {
  private char array[];
  private int start;   // offset of the slice within array
  private int index;   // absolute cursor: start <= index <= limit
  private int length;  // number of chars in the slice
  private int limit;   // start + length (exclusive end of the slice)

  public char [] getText() {
    return array;
  }

  public int getStart() {
    return start;
  }

  public int getLength() {
    return length;
  }

  /**
   * Set a new region of text to be examined by this iterator
   *
   * @param array text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  void setText(final char array[], int start, int length) {
    this.array = array;
    this.start = start;
    this.index = start;
    this.length = length;
    this.limit = start + length;
  }

  public char current() {
    // index never exceeds limit, so this is "at end" iff index == limit
    return index < limit ? array[index] : DONE;
  }

  public char first() {
    index = start;
    return current();
  }

  public char last() {
    // position on the final char, or on limit (== start) for an empty slice
    index = Math.max(start, limit - 1);
    return current();
  }

  public int getBeginIndex() {
    return 0;
  }

  public int getEndIndex() {
    return length;
  }

  public int getIndex() {
    return index - start;
  }

  public char next() {
    index++;
    if (index >= limit) {
      index = limit; // clamp so repeated next() stays at the end
      return DONE;
    }
    return current();
  }

  public char previous() {
    index--;
    if (index < start) {
      index = start; // clamp so repeated previous() stays at the start
      return DONE;
    }
    return current();
  }

  public char setIndex(int position) {
    // slice-relative position must lie in [0, length]
    if (position < 0 || position > length)
      throw new IllegalArgumentException("Illegal Position: " + position);
    index = start + position;
    return current();
  }

  @Override
  public Object clone() {
    CharArrayIterator copy = new CharArrayIterator();
    copy.setText(array, start, length);
    copy.index = index;
    return copy;
  }
}

View File

@ -0,0 +1,126 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
/**
* An internal BreakIterator for multilingual text, following recommendations
* from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
* <p>
* See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
* design.
* <p>
* Text is first divided into script boundaries. The processing is then
* delegated to the appropriate break iterator for that specific script.
* <p>
* This break iterator also allows you to retrieve the ISO 15924 script code
* associated with a piece of text.
* <p>
* See also UAX #29, UTR #24
* @lucene.experimental
*/
final class CompositeBreakIterator {
  private final ICUTokenizerConfig config;
  // lazily-created per-script breakers, indexed by UScript code
  private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
  private BreakIteratorWrapper rbbi; // breaker for the current script run
  private final ScriptIterator scriptIterator = new ScriptIterator();
  private char text[];

  CompositeBreakIterator(ICUTokenizerConfig config) {
    this.config = config;
  }

  /**
   * Retrieve the next break position. If the RBBI range is exhausted within the
   * script boundary, examine the next script boundary.
   *
   * @return the next break position or BreakIterator.DONE
   */
  int next() {
    int boundary = rbbi.next();
    // current script run exhausted: advance through following runs until a
    // breaker yields a boundary or there are no runs left
    while (boundary == BreakIterator.DONE && scriptIterator.next()) {
      switchToCurrentRun();
      boundary = rbbi.next();
    }
    if (boundary == BreakIterator.DONE) {
      return BreakIterator.DONE;
    }
    return boundary + scriptIterator.getScriptStart();
  }

  /**
   * Retrieve the current break position.
   *
   * @return the current break position or BreakIterator.DONE
   */
  int current() {
    final int boundary = rbbi.current();
    if (boundary == BreakIterator.DONE) {
      return BreakIterator.DONE;
    }
    return boundary + scriptIterator.getScriptStart();
  }

  /**
   * Retrieve the rule status code (token type) from the underlying break
   * iterator
   *
   * @return rule status code (see RuleBasedBreakIterator constants)
   */
  int getRuleStatus() {
    return rbbi.getRuleStatus();
  }

  /**
   * Retrieve the UScript script code for the current token. This code can be
   * decoded with UScript into a name or ISO 15924 code.
   *
   * @return UScript script code for the current token.
   */
  int getScriptCode() {
    return scriptIterator.getScriptCode();
  }

  /**
   * Set a new region of text to be examined by this iterator
   *
   * @param text buffer of text
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  void setText(final char text[], int start, int length) {
    this.text = text;
    scriptIterator.setText(text, start, length);
    if (scriptIterator.next()) {
      switchToCurrentRun();
    } else {
      // no script runs at all: install an empty COMMON-script breaker so
      // next()/current() immediately report DONE
      rbbi = getBreakIterator(UScript.COMMON);
      rbbi.setText(text, 0, 0);
    }
  }

  /** Points {@link #rbbi} at the script run the scriptIterator is on. */
  private void switchToCurrentRun() {
    rbbi = getBreakIterator(scriptIterator.getScriptCode());
    final int runStart = scriptIterator.getScriptStart();
    rbbi.setText(text, runStart, scriptIterator.getScriptLimit() - runStart);
  }

  /** Returns the cached breaker for a script, creating it on first use. */
  private BreakIteratorWrapper getBreakIterator(int scriptCode) {
    BreakIteratorWrapper wrapper = wordBreakers[scriptCode];
    if (wrapper == null) {
      wrapper = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
      wordBreakers[scriptCode] = wrapper;
    }
    return wrapper;
  }
}

View File

@ -0,0 +1,112 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
/**
* Default {@link ICUTokenizerConfig} that is generally applicable
* to many languages.
* <p>
* Generally tokenizes Unicode text according to UAX#29
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings:
* <ul>
* <li>Thai text is broken into words with a
* {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
* <li>Lao, Myanmar, and Khmer text is broken into syllables
* based on custom BreakIterator rules.
* <li>Hebrew text has custom tailorings to handle special cases
* involving punctuation.
* </ul>
* @lucene.experimental
*/
public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
  /** Token type for words containing ideographic characters */
  public static final String WORD_IDEO = "<IDEO>";
  /** Token type for words containing Japanese kana */
  public static final String WORD_KANA = "<KANA>";
  /** Token type for words that contain letters */
  public static final String WORD_LETTER = "<WORD>";
  /** Token type for words that appear to be numbers */
  public static final String WORD_NUMBER = "<NUM>";

  /*
   * the default breakiterators in use. these can be expensive to
   * instantiate, cheap to clone.
   */
  private static final BreakIterator rootBreakIterator =
    BreakIterator.getWordInstance(ULocale.ROOT);
  private static final BreakIterator thaiBreakIterator =
    BreakIterator.getWordInstance(new ULocale("th_TH"));
  private static final BreakIterator hebrewBreakIterator =
    readBreakIterator("Hebrew.brk");
  private static final BreakIterator khmerBreakIterator =
    readBreakIterator("Khmer.brk");
  private static final BreakIterator laoBreakIterator =
    new LaoBreakIterator(readBreakIterator("Lao.brk"));
  private static final BreakIterator myanmarBreakIterator =
    readBreakIterator("Myanmar.brk");

  /**
   * Returns a fresh BreakIterator for the given UScript code: tailored
   * iterators for Thai, Hebrew, Khmer, Lao and Myanmar; the UAX#29 root
   * word iterator for everything else. Always a clone of the shared
   * prototype, since BreakIterators are stateful.
   */
  @Override
  public BreakIterator getBreakIterator(int script) {
    switch(script) {
      case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
      case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
      case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
      case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
      case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
      default: return (BreakIterator)rootBreakIterator.clone();
    }
  }

  /** Maps an RBBI rule status to one of the token-type constants above. */
  @Override
  public String getType(int script, int ruleStatus) {
    switch (ruleStatus) {
      case RuleBasedBreakIterator.WORD_IDEO:
        return WORD_IDEO;
      case RuleBasedBreakIterator.WORD_KANA:
        return WORD_KANA;
      case RuleBasedBreakIterator.WORD_LETTER:
        return WORD_LETTER;
      case RuleBasedBreakIterator.WORD_NUMBER:
        return WORD_NUMBER;
      default: /* some other custom code */
        return "<OTHER>";
    }
  }

  /**
   * Loads a compiled RBBI rule set bundled next to this class on the
   * classpath. Fails fast with a descriptive error when the resource is
   * missing, and always closes the stream (the original leaked it when
   * getInstanceFromCompiledRules threw).
   */
  private static RuleBasedBreakIterator readBreakIterator(String filename) {
    InputStream is =
      DefaultICUTokenizerConfig.class.getResourceAsStream(filename);
    if (is == null) {
      // previously this fell through to an opaque NullPointerException
      throw new RuntimeException("compiled break rules not found: " + filename);
    }
    try {
      return RuleBasedBreakIterator.getInstanceFromCompiledRules(is);
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      try {
        is.close();
      } catch (IOException e) {
        // best-effort close; the interesting outcome was already decided above
      }
    }
  }
}

View File

@ -0,0 +1,196 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
/**
* Breaks text into words according to UAX #29: Unicode Text Segmentation
* (http://www.unicode.org/reports/tr29/)
* <p>
* Words are broken across script boundaries, then segmented according to
* the BreakIterator and typing provided by the {@link ICUTokenizerConfig}
* </p>
* @see ICUTokenizerConfig
* @lucene.experimental
*/
public final class ICUTokenizer extends Tokenizer {
  /** I/O chunk size; also the effective maximum token length (see refill()) */
  private static final int IOBUFFER = 4096;
  private final char buffer[] = new char[IOBUFFER];
  /** true length of text in the buffer */
  private int length = 0;
  /** length in buffer that can be evaluated safely, up to a safe end point */
  private int usableLength = 0;
  /** accumulated offset of previous buffers for this reader, for offsetAtt */
  private int offset = 0;

  private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */
  private final ICUTokenizerConfig config;

  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class);

  /**
   * Construct a new ICUTokenizer that breaks text into words from the given
   * Reader.
   * <p>
   * The default script-specific handling is used.
   *
   * @param input Reader containing text to tokenize.
   * @see DefaultICUTokenizerConfig
   */
  public ICUTokenizer(Reader input) {
    this(input, new DefaultICUTokenizerConfig());
  }

  /**
   * Construct a new ICUTokenizer that breaks text into words from the given
   * Reader, using a tailored BreakIterator configuration.
   *
   * @param input Reader containing text to tokenize.
   * @param config Tailored BreakIterator configuration
   */
  public ICUTokenizer(Reader input, ICUTokenizerConfig config) {
    super(input);
    this.config = config;
    breaker = new CompositeBreakIterator(config);
  }

  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    if (length == 0)   // first call after construction/reset: load the first chunk
      refill();
    while (!incrementTokenBuffer()) {  // current chunk exhausted: read more text
      refill();
      if (length <= 0) // no more bytes to read;
        return false;
    }
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    breaker.setText(buffer, 0, 0);  // discard any leftover break-iterator state
    length = usableLength = offset = 0;
  }

  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
    reset();
  }

  @Override
  public void end() throws IOException {
    // set final offset to the end of the last chunk read from the reader
    final int finalOffset = (length < 0) ? offset : offset + length;
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /*
   * This tokenizes text based upon the longest matching rule, and because of
   * this, isn't friendly to a Reader.
   *
   * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
   * text, the last unambiguous break point is found (in this implementation:
   * white space character) Any remaining characters represent possible partial
   * words, so are appended to the front of the next chunk.
   *
   * There is the possibility that there are no unambiguous break points within
   * an entire 4kB chunk of text (binary data). So there is a maximum word limit
   * of 4kB since it will not try to grow the buffer in this case.
   */

  /**
   * Returns the last unambiguous break position in the text.
   *
   * @return position of character, or -1 if one does not exist
   */
  private int findSafeEnd() {
    // scan backwards: the position just after the last whitespace is safe
    for (int i = length - 1; i >= 0; i--)
      if (UCharacter.isWhitespace(buffer[i]))
        return i + 1;
    return -1;
  }

  /**
   * Refill the buffer, accumulating the offset and setting usableLength to the
   * last unambiguous break position
   *
   * @throws IOException
   */
  private void refill() throws IOException {
    offset += usableLength;
    int leftover = length - usableLength;   // unprocessed tail (possible partial word)
    System.arraycopy(buffer, usableLength, buffer, 0, leftover);  // move tail to the front
    int requested = buffer.length - leftover;
    int returned = input.read(buffer, leftover, requested);
    length = returned < 0 ? leftover : returned + leftover;
    if (returned < requested) /* reader has been emptied, process the rest */
      usableLength = length;
    else { /* still more data to be read, find a safe-stopping place */
      usableLength = findSafeEnd();
      if (usableLength < 0)
        usableLength = length; /*
                                * more than IOBUFFER of text without space,
                                * gonna possibly truncate tokens
                                */
    }
    breaker.setText(buffer, 0, Math.max(0, usableLength));
  }

  /*
   * return true if there is a token from the buffer, or false if it is
   * exhausted.
   */
  private boolean incrementTokenBuffer() {
    int start = breaker.current();
    if (start == BreakIterator.DONE)
      return false; // BreakIterator exhausted

    // find the next set of boundaries, skipping over non-tokens (rule status 0)
    int end = breaker.next();
    while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
      start = end;
      end = breaker.next();
    }

    if (start == BreakIterator.DONE)
      return false; // BreakIterator exhausted

    termAtt.copyBuffer(buffer, start, end - start);
    // boundaries are buffer-relative; add the accumulated stream offset
    offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
    typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus()));
    scriptAtt.setCode(breaker.getScriptCode());
    return true;
  }
}

View File

@ -0,0 +1,33 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.text.BreakIterator;
/**
* Class that allows for tailored Unicode Text Segmentation on
* a per-writing system basis.
* @lucene.experimental
*/
public abstract class ICUTokenizerConfig {
  /**
   * Return a BreakIterator capable of processing a given script.
   *
   * @param script UScript script code for the run of text
   */
  public abstract BreakIterator getBreakIterator(int script);
  /**
   * Return a token type value for a given script and BreakIterator
   * rule status.
   *
   * @param script UScript script code of the token
   * @param ruleStatus rule status returned by the BreakIterator for the token
   */
  public abstract String getType(int script, int ruleStatus);
}

View File

@ -0,0 +1,226 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
/**
* Syllable iterator for Lao text.
* <p>
* This breaks Lao text into syllables according to:
* <i>Syllabification of Lao Script for Line Breaking</i>
* Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
* Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
* <ul>
* <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
* <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
* </ul>
* <p>
* Most work is accomplished with RBBI rules, however some additional special logic is needed
* that cannot be coded in a grammar, and this is implemented here.
* <p>
* For example, what appears to be a final consonant might instead be part of the next syllable.
* Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
* <p>
* Take for instance the text ກວ່າດອກ
* The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
* What LaoBreakIterator does, according to the paper:
* <ol>
* <li>backtrack and remove the final consonant from the last syllable, placing it on the current syllable.
* <li>verify the modified previous syllable (ກວ່າ ) is still legal.
* <li>verify the modified current syllable (ດອກ) is now legal.
* <li>If 2 or 3 fails, then restore the consonant to the last syllable and skip the current character.
* </ol>
* <p>
* Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
* This is the issue of combining marks being in the wrong order (typos).
* @lucene.experimental
*/
public class LaoBreakIterator extends BreakIterator {
  RuleBasedBreakIterator rules;  // primary syllable-matching rules
  CharArrayIterator text;        // the full input text, as supplied to setText()
  CharArrayIterator working = new CharArrayIterator();  // window into text starting at workingOffset
  int workingOffset = 0;         // offset of the working window within the full text

  CharArrayIterator verifyText = new CharArrayIterator(); // scratch iterator for verifyPushBack()
  RuleBasedBreakIterator verify;  // separate rules instance used only for verification

  /** all characters of the Lao script */
  private static final UnicodeSet laoSet;
  static {
    laoSet = new UnicodeSet("[:Lao:]");
    laoSet.compact();
    laoSet.freeze();  // frozen: safe for concurrent read-only use
  }

  public LaoBreakIterator(RuleBasedBreakIterator rules) {
    // clone both so this iterator owns private, independently-positioned copies
    this.rules = (RuleBasedBreakIterator) rules.clone();
    this.verify = (RuleBasedBreakIterator) rules.clone();
  }

  /** Returns the current position, translated into full-text coordinates. */
  @Override
  public int current() {
    int current = rules.current();
    return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
  }

  /** Resets the working window to the whole text, then moves to the first boundary. */
  @Override
  public int first() {
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
    int first = rules.first();
    return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
  }

  @Override
  public int following(int offset) {
    throw new UnsupportedOperationException();
  }

  @Override
  public CharacterIterator getText() {
    return text;
  }

  @Override
  public int last() {
    throw new UnsupportedOperationException();
  }

  /**
   * Returns the next boundary. Implements the pushback described in the class
   * javadoc: if the greedy match left an illegal sequence after the boundary,
   * the character before the boundary is handed back to the following syllable
   * (after verification), and the shortened boundary is returned.
   */
  @Override
  public int next() {
    int current = current();
    int next = rules.next();
    if (next == BreakIterator.DONE)
      return next;
    else
      next += workingOffset;
    char c = working.current();  // character immediately after the boundary just found
    int following = rules.next(); // lookahead
    if (following != BreakIterator.DONE) {
      following += workingOffset;
      // rule status 0 on the lookahead = the following text matched no rule
      // (treated as an illegal sequence); if the boundary character is Lao,
      // try pushing it back onto the next syllable (steps 1-3 in class javadoc)
      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
        workingOffset = next - 1;
        // restart the rules on a window beginning at the pushed-back character
        working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
        return next - 1;
      }
      rules.previous(); // undo the lookahead
    }
    return next;
  }

  @Override
  public int next(int n) {
    if (n < 0)
      throw new UnsupportedOperationException("Backwards traversal is unsupported");
    int result = current();
    while (n > 0) {
      result = next();
      --n;
    }
    return result;
  }

  @Override
  public int previous() {
    throw new UnsupportedOperationException("Backwards traversal is unsupported");
  }

  @Override
  public void setText(CharacterIterator text) {
    if (!(text instanceof CharArrayIterator))
      throw new UnsupportedOperationException("unsupported CharacterIterator");
    this.text = (CharArrayIterator) text;
    // repair out-of-order combining marks (typos) before segmenting; note this
    // mutates the underlying char[] in place
    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
  }

  @Override
  public void setText(String newText) {
    CharArrayIterator ci = new CharArrayIterator();
    ci.setText(newText.toCharArray(), 0, newText.length());
    setText(ci);
  }

  /**
   * Verifies that moving the last character of the previous syllable onto the
   * current one leaves both syllables legal (steps 2 and 3 in the class javadoc).
   *
   * @param current full-text start of the previous syllable
   * @param next full-text boundary produced by the greedy match
   * @return true if the pushback is legal and should be applied
   */
  private boolean verifyPushBack(int current, int next) {
    int shortenedSyllable = next - current - 1;

    // step 2: the shortened previous syllable must still match exactly one rule
    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
    verify.setText(verifyText);
    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
      return false;
    // step 3: the new current syllable (starting one char earlier) must be legal
    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
    verify.setText(verifyText);
    return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
  }

  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
  /** Bubble-sorts adjacent combining marks into ascending combining-class order. */
  private void ccReorder(char[] text, int start, int length) {
    boolean reordered;
    do {
      int prevCC = 0;
      reordered = false;
      for (int i = start; i < start + length; i++) {
        final char c = text[i];
        final int cc = UCharacter.getCombiningClass(c);
        if (cc > 0 && cc < prevCC) {
          // swap with the previous character; repeat passes until stable
          text[i] = text[i - 1];
          text[i - 1] = c;
          reordered = true;
        } else {
          prevCC = cc;
        }
      }
    } while (reordered == true);
  }

  /**
   * Clone method. Creates another LaoBreakIterator with the same behavior
   * and current state as this one.
   * @return The clone.
   */
  @Override
  public Object clone() {
    LaoBreakIterator other = (LaoBreakIterator) super.clone();
    // deep-copy all mutable state so the clone iterates independently
    other.rules = (RuleBasedBreakIterator) rules.clone();
    other.verify = (RuleBasedBreakIterator) verify.clone();
    if (text != null)
      other.text = (CharArrayIterator) text.clone();
    if (working != null)
      other.working = (CharArrayIterator) working.clone();
    if (verifyText != null)
      other.verifyText = (CharArrayIterator) verifyText.clone();
    return other;
  }
}

View File

@ -0,0 +1,170 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, and/or sell copies of the
* Software, and to permit persons to whom the Software is furnished to do so,
* provided that the above copyright notice(s) and this permission notice appear
* in all copies of the Software and that both the above copyright notice(s) and
* this permission notice appear in supporting documentation.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
* LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
* ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
* IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Except as contained in this notice, the name of a copyright holder shall not
* be used in advertising or otherwise to promote the sale, use or other
* dealings in this Software without prior written authorization of the
* copyright holder.
*/
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UTF16;
/**
* An iterator that locates ISO 15924 script boundaries in text.
* <p>
* This is not the same as simply looking at the Unicode block, or even the
* Script property. Some characters are 'common' across multiple scripts, and
* some 'inherit' the script value of text surrounding them.
* <p>
* This is similar to ICU (internal-only) UScriptRun, with the following
* differences:
* <ul>
* <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
* is not necessary. It's also quite expensive.
* <li>Non-spacing marks inherit the script of their base character, following
* recommendations from UTR #24.
* </ul>
* @lucene.experimental
*/
final class ScriptIterator {
  private char text[];      // text under examination
  private int start;        // start offset of the examined region
  private int limit;        // one past the last offset of the region
  private int index;        // current iteration position

  private int scriptStart;  // start of the current script run
  private int scriptLimit;  // one past the end of the current script run
  private int scriptCode;   // UScript code of the current run

  /**
   * Get the start of this script run
   *
   * @return start position of script run
   */
  int getScriptStart() {
    return scriptStart;
  }

  /**
   * Get the index of the first character after the end of this script run
   *
   * @return position of the first character after this script run
   */
  int getScriptLimit() {
    return scriptLimit;
  }

  /**
   * Get the UScript script code for this script run
   *
   * @return code for the script of the current run
   */
  int getScriptCode() {
    return scriptCode;
  }

  /**
   * Iterates to the next script run, returning true if one exists.
   *
   * @return true if there is another script run, false otherwise.
   */
  boolean next() {
    if (scriptLimit >= limit)
      return false;  // whole region has been consumed

    scriptCode = UScript.COMMON;
    scriptStart = scriptLimit;

    while (index < limit) {
      // read a full codepoint (handles surrogate pairs)
      final int ch = UTF16.charAt(text, start, limit, index - start);
      final int sc = getScript(ch);

      /*
       * From UTR #24: Implementations that determine the boundaries between
       * characters of given scripts should never break between a non-spacing
       * mark and its base character. Thus for boundary determinations and
       * similar sorts of processing, a non-spacing mark whatever its script
       * value should inherit the script value of its base character.
       */
      if (isSameScript(scriptCode, sc)
          || UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) {
        index += UTF16.getCharCount(ch);  // advance by one or two chars

        /*
         * Inherited or Common becomes the script code of the surrounding text.
         */
        if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
          scriptCode = sc;
        }
      } else {
        break;  // script changed: current run ends here
      }
    }

    scriptLimit = index;
    return true;
  }

  /** Determine if two scripts are compatible. */
  private static boolean isSameScript(int scriptOne, int scriptTwo) {
    // codes <= INHERITED (COMMON/INHERITED) are compatible with any script
    return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED
        || scriptOne == scriptTwo;
  }

  /**
   * Set a new region of text to be examined by this iterator
   *
   * @param text text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  void setText(char text[], int start, int length) {
    this.text = text;
    this.start = start;
    this.index = start;
    this.limit = start + length;
    this.scriptStart = start;
    this.scriptLimit = start;
    this.scriptCode = UScript.INVALID_CODE;  // no run identified until next()
  }

  /** linear fast-path for basic latin case */
  private static final int basicLatin[] = new int[128];

  static {
    for (int i = 0; i < basicLatin.length; i++)
      basicLatin[i] = UScript.getScript(i);
  }

  /** fast version of UScript.getScript(). Basic Latin is an array lookup */
  private static int getScript(int codepoint) {
    if (0 <= codepoint && codepoint < basicLatin.length)
      return basicLatin[codepoint];
    else
      return UScript.getScript(codepoint);
  }
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Tokenizer that breaks text into words with the Unicode Text Segmentation algorithm.
</body>
</html>

View File

@ -0,0 +1,51 @@
package org.apache.lucene.analysis.icu.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Attribute;
import com.ibm.icu.lang.UScript; // javadoc @link
/**
* This attribute stores the UTR #24 script value for a token of text.
* @lucene.experimental
*/
public interface ScriptAttribute extends Attribute {
  /**
   * Get the numeric code for this script value.
   * This is the constant value from {@link UScript}.
   * @return numeric code
   */
  public int getCode();
  /**
   * Set the numeric code for this script value.
   * This is the constant value from {@link UScript}.
   * @param code numeric code
   */
  public void setCode(int code);
  /**
   * Get the full name.
   * @return UTR #24 full name.
   */
  public String getName();
  /**
   * Get the abbreviated name.
   * @return UTR #24 abbreviated name.
   */
  public String getShortName();
}

View File

@ -0,0 +1,83 @@
package org.apache.lucene.analysis.icu.tokenattributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Serializable;
import org.apache.lucene.util.AttributeImpl;
import com.ibm.icu.lang.UScript;
/**
* Implementation of {@link ScriptAttribute} that stores the script
* as an integer.
* @lucene.experimental
*/
public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribute, Cloneable, Serializable {
  // script code, a UScript constant; UScript.COMMON when unset/cleared
  private int code = UScript.COMMON;

  public int getCode() {
    return code;
  }

  public void setCode(int code) {
    this.code = code;
  }

  /** Full script name, derived from the stored code via {@link UScript#getName}. */
  public String getName() {
    return UScript.getName(code);
  }

  /** Abbreviated script name, derived from the stored code via {@link UScript#getShortName}. */
  public String getShortName() {
    return UScript.getShortName(code);
  }

  @Override
  public void clear() {
    code = UScript.COMMON;
  }

  @Override
  public void copyTo(AttributeImpl target) {
    ((ScriptAttribute) target).setCode(code);
  }

  @Override
  public boolean equals(Object other) {
    // equal iff same instance, or another ScriptAttributeImpl with the same code
    return this == other
        || (other instanceof ScriptAttributeImpl
            && ((ScriptAttributeImpl) other).code == code);
  }

  @Override
  public int hashCode() {
    // code is the sole state, so it serves directly as the hash
    return code;
  }

  @Override
  public String toString() {
    return "script=" + getName();
  }
}

View File

@ -30,6 +30,8 @@ performance, keeping current with the Unicode Standard, and providing richer
APIs. This module exposes the following functionality:
</p>
<ul>
<li><a href="#segmentation">Text Segmentation</a>: Tokenizes text based on
properties and rules defined in Unicode.</li>
<li><a href="#collation">Collation</a>: Compare strings according to the
conventions and standards of a particular language, region or country.</li>
<li><a href="#normalization">Normalization</a>: Converts text to a unique,
@ -42,6 +44,35 @@ APIs. This module exposes the following functionality:
a context-sensitive fashion: e.g. mapping Traditional to Simplified Chinese</li>
</ul>
<hr/>
<h1><a name="segmentation">Text Segmentation</a></h1>
<p>
Text Segmentation (Tokenization) divides document and query text into index terms
(typically words). Unicode provides special properties and rules so that this can
be done in a manner that works well with most languages.
</p>
<p>
Text Segmentation implements the word segmentation specified in
<a href="http://unicode.org/reports/tr29/">Unicode Text Segmentation</a>.
Additionally the algorithm can be tailored based on writing system, for example
text in the Thai script is automatically delegated to a dictionary-based segmentation
algorithm.
</p>
<h2>Use Cases</h2>
<ul>
<li>
As a more thorough replacement for StandardTokenizer that works well for
most languages.
</li>
</ul>
<h2>Example Usages</h2>
<h3>Tokenizing multilanguage text</h3>
<code><pre>
/**
* This tokenizer will work well in general for most languages.
*/
Tokenizer tokenizer = new ICUTokenizer(reader);
</pre></code>
<hr/>
<h1><a name="collation">Collation</a></h1>
<p>
<code>ICUCollationKeyFilter</code>

View File

@ -0,0 +1,109 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
import org.apache.lucene.util.LuceneTestCase;
public class TestCharArrayIterator extends LuceneTestCase {

  /** walk forward/backward through a small text, checking indexes and chars */
  public void testBasicUsage() {
    char content[] = "testing".toCharArray();
    CharArrayIterator it = new CharArrayIterator();
    it.setText(content, 0, content.length);
    assertEquals(0, it.getBeginIndex());
    assertEquals(7, it.getEndIndex());
    assertEquals(0, it.getIndex());
    assertEquals('t', it.current());
    assertEquals('e', it.next());
    assertEquals('g', it.last());
    assertEquals('n', it.previous());
    assertEquals('t', it.first());
    assertEquals(CharacterIterator.DONE, it.previous());
  }

  public void testFirst() {
    char content[] = "testing".toCharArray();
    CharArrayIterator it = new CharArrayIterator();
    it.setText(content, 0, content.length);
    it.next();
    // Sets the position to getBeginIndex() and returns the character at that position.
    assertEquals('t', it.first());
    assertEquals(it.getBeginIndex(), it.getIndex());
    // or DONE if the text is empty
    it.setText(new char[] {}, 0, 0);
    assertEquals(CharacterIterator.DONE, it.first());
  }

  public void testLast() {
    char content[] = "testing".toCharArray();
    CharArrayIterator it = new CharArrayIterator();
    it.setText(content, 0, content.length);
    // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty)
    // and returns the character at that position.
    assertEquals('g', it.last());
    assertEquals(it.getIndex(), it.getEndIndex() - 1);
    // or DONE if the text is empty
    it.setText(new char[] {}, 0, 0);
    assertEquals(CharacterIterator.DONE, it.last());
    assertEquals(it.getEndIndex(), it.getIndex());
  }

  public void testCurrent() {
    CharArrayIterator it = new CharArrayIterator();
    // Gets the character at the current position (as returned by getIndex()).
    char content[] = "testing".toCharArray();
    it.setText(content, 0, content.length);
    assertEquals('t', it.current());
    it.last();
    it.next();
    // or DONE if the current position is off the end of the text.
    assertEquals(CharacterIterator.DONE, it.current());
  }

  public void testNext() {
    CharArrayIterator it = new CharArrayIterator();
    it.setText("te".toCharArray(), 0, 2);
    // Increments the iterator's index by one and returns the character at the new index.
    assertEquals('e', it.next());
    assertEquals(1, it.getIndex());
    // or DONE if the new position is off the end of the text range.
    assertEquals(CharacterIterator.DONE, it.next());
    assertEquals(it.getEndIndex(), it.getIndex());
  }

  public void testSetIndex() {
    char content[] = "test".toCharArray();
    CharArrayIterator it = new CharArrayIterator();
    it.setText(content, 0, content.length);
    // positions past the end must be rejected
    try {
      it.setIndex(5);
      fail();
    } catch (Exception e) {
      assertTrue(e instanceof IllegalArgumentException);
    }
  }

  public void testClone() {
    char text[] = "testing".toCharArray();
    CharArrayIterator original = new CharArrayIterator();
    original.setText(text, 0, text.length);
    original.next();
    CharArrayIterator copy = (CharArrayIterator) original.clone();
    // clone must preserve position and then track the original step-for-step
    assertEquals(original.getIndex(), copy.getIndex());
    assertEquals(original.next(), copy.next());
    assertEquals(original.last(), copy.last());
  }
}

View File

@ -0,0 +1,225 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import java.util.Arrays;
public class TestICUTokenizer extends BaseTokenStreamTestCase {
/** a long whitespace-only prefix followed by real content must still tokenize */
public void testHugeDoc() throws IOException {
  char whitespace[] = new char[4094];
  Arrays.fill(whitespace, ' ');
  StringBuilder builder = new StringBuilder();
  builder.append(whitespace).append("testing 1234");
  ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
/** a single 40960-char run with no break points is emitted as ten 4096-char tokens */
public void testHugeTerm2() throws IOException {
  char run[] = new char[40960];
  Arrays.fill(run, 'a');
  String input = new String(run);
  ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
  char token[] = new char[4096];
  Arrays.fill(token, 'a');
  String expectedToken = new String(token);
  String expected[] = {
      expectedToken, expectedToken, expectedToken,
      expectedToken, expectedToken, expectedToken,
      expectedToken, expectedToken, expectedToken,
      expectedToken
  };
  assertTokenStreamContents(tokenizer, expected);
}
/** shared analyzer for the language tests below: ICUTokenizer followed by ICUNormalizer2Filter */
private Analyzer a = new ReusableAnalyzerBase() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    Tokenizer tokenizer = new ICUTokenizer(reader);
    TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
    return new TokenStreamComponents(tokenizer, filter);
  }
};
/** Armenian: note expected terms are the case-folded forms of the input words */
public void testArmenian() throws Exception {
  assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
      new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
      "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
}
/** Amharic (Ethiopic script): punctuation is dropped, words are kept whole */
public void testAmharic() throws Exception {
  assertAnalyzesTo(a, "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት (ኢንሳይክሎፒዲያ) ነው። ማንኛውም",
      new String[] { "ዊኪፔድያ", "የባለ", "ብዙ", "ቋንቋ", "የተሟላ", "ትክክለኛና", "ነጻ", "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", "ነው", "ማንኛውም" } );
}
/** Arabic mixed with embedded Latin: both scripts tokenize; Latin tokens are lowercased. */
public void testArabic() throws Exception {
assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
"بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } );
}
/** Syriac/Aramaic: combining marks stay attached to their tokens; quoted fragments split at the quotes. */
public void testAramaic() throws Exception {
assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
}
/** Bengali: tokenization of Bengali words and digits (১৫, ২০০১); danda-style punctuation is dropped. */
public void testBengali() throws Exception {
assertAnalyzesTo(a, "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক সংস্থা)। উইকিপিডিয়ার শুরু ১৫ জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া রয়েছে।",
new String[] { "এই", "বিশ্বকোষ", "পরিচালনা", "করে", "উইকিমিডিয়া", "ফাউন্ডেশন", "একটি", "অলাভজনক", "সংস্থা", "উইকিপিডিয়ার",
"শুরু", "১৫", "জানুয়ারি", "২০০১", "সালে", "এখন", "পর্যন্ত", "২০০টিরও", "বেশী", "ভাষায়", "উইকিপিডিয়া", "রয়েছে" });
}
/** Farsi: Persian words and Extended-Arabic-Indic digits (۲۵, ۱۳۷۹) each form single tokens. */
public void testFarsi() throws Exception {
assertAnalyzesTo(a, "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت", "مکملی",
"برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
}
/**
 * Greek: note "εθελοντέσ" in the expected output — the normalization filter
 * maps final sigma (ς) to σ, so the token differs from the surface form.
 */
public void testGreek() throws Exception {
assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
}
/** Lao: a run with no spaces is segmented into words (dictionary-less scripts need tailored rules — see TestLaoBreakIterator). */
public void testLao() throws Exception {
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
}
/** Thai: unspaced text is word-segmented; Thai digits ๑๒๓๔ stay a single numeric token; sentence punctuation is dropped. */
public void testThai() throws Exception {
assertAnalyzesTo(a, "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔",
new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "แล้ว", "เธอ", "จะ", "ไป", "ไหน", "๑๒๓๔"});
}
/** Tibetan: syllables separated by tsheg (་) each become a token. */
// NOTE(review): the expected array contains an empty string ("") where a
// syllable (likely "ཧ" from "ཧ་ཅང") is expected — this looks like an
// extraction/encoding artifact; verify against the original source.
public void testTibetan() throws Exception {
assertAnalyzesTo(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང", "འཕེལ", "དུ", "གཏོང", "བར", "", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
}
/*
 * For chinese, tokenize as char (these can later form bigrams or whatever)
 * TODO: why do full-width numerics have no word-break prop?
 */
// NOTE(review): the expected tokens below are empty strings plus a "tests"
// token with no matching input text — the per-character CJK tokens (and
// probably a trailing "1234 Tests" input segment) appear to have been lost
// in an extraction/encoding step. Verify against the original source.
public void testChinese() throws Exception {
assertAnalyzesTo(a, "我是中国人。 ",
new String[] { "", "", "", "", "", "tests"});
}
/** Inputs containing no letters or digits must produce no tokens at all. */
public void testEmpty() throws Exception {
  String[] tokenless = { "", ".", " " };
  for (String input : tokenless) {
    assertAnalyzesTo(a, input, new String[] {});
  }
}
/* test various jira issues this analyzer is related to */
public void testLUCENE1545() throws Exception {
/*
 * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
 * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
 * Expected result is only one token "moͤchte".
 */
assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
}
/* Tests from StandardAnalyzer, just to show behavior is similar */
/** Alphanumeric runs stay single tokens and are lowercased. */
public void testAlphanumericSA() throws Exception {
  assertAnalyzesTo(a, "2B", new String[]{"2b"});
  assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
}
/** Dashes, commas and slashes all act as token delimiters. */
public void testDelimitersSA() throws Exception {
  assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
  assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
  assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
}
/** Internal apostrophes are kept inside the token (O'Reilly, you're, O'Reilly's, ...). */
public void testApostrophesSA() throws Exception {
  String inputs[] = { "O'Reilly", "you're", "she's", "Jim's", "don't", "O'Reilly's" };
  String outputs[] = { "o'reilly", "you're", "she's", "jim's", "don't", "o'reilly's" };
  for (int i = 0; i < inputs.length; i++) {
    assertAnalyzesTo(a, inputs[i], new String[]{ outputs[i] });
  }
}
/** Numeric compounds stay single tokens: decimals, serial/model numbers, IP addresses. */
public void testNumericSA() throws Exception {
  // floating point, serial, model numbers, ip addresses, etc.
  // every other segment must have at least one digit
  assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
  assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
  // (an exact duplicate of the following assertion was removed)
  assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
}
/** Digit runs inside ordinary text become their own tokens. */
public void testTextWithNumbersSA() throws Exception {
  String sentence = "David has 5000 bones";
  assertAnalyzesTo(a, sentence, new String[]{"david", "has", "5000", "bones"});
}
/** Mixed text: punctuation-only runs are dropped, letters are lowercased. */
public void testVariousTextSA() throws Exception {
  assertAnalyzesTo(a, "C embedded developers wanted",
      new String[]{"c", "embedded", "developers", "wanted"});
  String fooBar[] = {"foo", "bar", "foo", "bar"};
  assertAnalyzesTo(a, "foo bar FOO BAR", fooBar);
  assertAnalyzesTo(a, "foo bar . FOO <> BAR", fooBar);
  assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
}
/** Korean: Hangul syllable runs segment on whitespace. */
public void testKoreanSA() throws Exception {
  String input = "안녕하세요 한글입니다";
  assertAnalyzesTo(a, input, new String[]{"안녕하세요", "한글입니다"});
}
/** Reusing the analyzer's token stream must yield the same tokens as a fresh one (same Tibetan input as testTibetan). */
// NOTE(review): as in testTibetan, the "" entry in the expected array looks
// like an extraction artifact for a lost syllable — verify against the source.
public void testReusableTokenStream() throws Exception {
assertAnalyzesToReuse(a, "སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། །",
new String[] { "སྣོན", "མཛོད", "དང", "ལས", "འདིས", "བོད", "ཡིག", "མི", "ཉམས", "གོང",
"འཕེལ", "དུ", "གཏོང", "བར", "", "ཅང", "དགེ", "མཚན", "མཆིས", "སོ" });
}
/** Start/end offsets of each token must point back into the original text. */
public void testOffsets() throws Exception {
  String input = "David has 5000 bones";
  String tokens[] = {"david", "has", "5000", "bones"};
  int starts[] = {0, 6, 10, 15};
  int ends[] = {5, 9, 14, 20};
  assertAnalyzesTo(a, input, tokens, starts, ends);
}
/** Token type attribute: alphabetic tokens are tagged &lt;WORD&gt;, numeric ones &lt;NUM&gt;. */
public void testTypes() throws Exception {
  String input = "David has 5000 bones";
  String tokens[] = {"david", "has", "5000", "bones"};
  String types[] = { "<WORD>", "<WORD>", "<NUM>", "<WORD>" };
  assertAnalyzesTo(a, input, tokens, types);
}
}

View File

@ -0,0 +1,90 @@
package org.apache.lucene.analysis.icu.segmentation;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStream;
import org.apache.lucene.util.LuceneTestCase;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
/**
 * Tests LaoBreakIterator and its RBBI rules
 */
public class TestLaoBreakIterator extends LuceneTestCase {
  private BreakIterator wordIterator;

  @Override
  protected void setUp() throws Exception {
    super.setUp();
    // Lao.brk is the compiled form of the Lao RBBI rules (see RBBIRuleCompiler)
    InputStream is = getClass().getResourceAsStream("Lao.brk");
    wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
    is.close();
  }

  /**
   * Walks {@code iterator} over {@code sourceText} and asserts that the
   * word-bearing segments are exactly {@code tokens}, in order, with no
   * word segments left over at the end.
   */
  private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
    char text[] = sourceText.toCharArray();
    CharArrayIterator ci = new CharArrayIterator();
    ci.setText(text, 0, text.length);
    iterator.setText(ci);
    for (int i = 0; i < tokens.length; i++) {
      int start, end;
      // skip segments that contain no letter or digit (spaces, punctuation, ...)
      do {
        start = iterator.current();
        end = iterator.next();
      } while (end != BreakIterator.DONE && !isWord(text, start, end));
      assertTrue(start != BreakIterator.DONE);
      assertTrue(end != BreakIterator.DONE);
      assertEquals(tokens[i], new String(text, start, end - start));
    }
    assertTrue(iterator.next() == BreakIterator.DONE);
  }

  /**
   * Returns true if the span [start, end) of {@code text} contains at least
   * one letter or digit codepoint.
   */
  protected boolean isWord(char text[], int start, int end) {
    int codepoint;
    for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
      // bugfix: read the codepoint at the loop position i; the original
      // always read the codepoint at start, so a segment whose first
      // codepoint is not a letter/digit but contains one later was
      // incorrectly rejected (and getCharCount was fed the wrong codepoint).
      codepoint = UTF16.charAt(text, 0, end, i);

      if (UCharacter.isLetterOrDigit(codepoint))
        return true;
    }
    return false;
  }

  public void testBasicUsage() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
    assertBreaksTo(wordIterator, "ຜູ້​ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
    assertBreaksTo(wordIterator, "", new String[] {});
    assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
  }

  public void testNumerics() throws Exception {
    assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
    assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
  }

  public void testTextAndNumerics() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
  }
}

View File

@ -0,0 +1,101 @@
package org.apache.lucene.analysis.icu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.lang.reflect.Method;
import com.ibm.icu.text.RuleBasedBreakIterator;
/**
 * Command-line utility to convert RuleBasedBreakIterator (.rbbi) files into
 * binary compiled form (.brk).
 */
public class RBBIRuleCompiler {

  /**
   * Reads an RBBI rule file as UTF-8, dropping the text of comment lines
   * (those starting with '#') while still appending each line's newline —
   * presumably so rule line numbers in ICU syntax errors match the source
   * file (TODO confirm).
   *
   * @param ruleFile the .rbbi source file
   * @return the rule text
   * @throws IOException if the file cannot be read
   */
  static String getRules(File ruleFile) throws IOException {
    StringBuilder rules = new StringBuilder();
    InputStream in = new FileInputStream(ruleFile);
    try {
      BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8"));
      String line = null;
      while ((line = cin.readLine()) != null) {
        if (!line.startsWith("#"))
          rules.append(line);
        rules.append('\n');
      }
    } finally {
      // bugfix: close in a finally block so the stream is not leaked when
      // reading throws; closing the underlying stream suffices here.
      in.close();
    }
    return rules.toString();
  }

  /**
   * Compiles every *.rbbi file in srcDir into a *.brk file in destDir.
   * Exits the VM with status 1 on a rule syntax error.
   */
  static void compile(File srcDir, File destDir) throws Exception {
    File files[] = srcDir.listFiles(new FilenameFilter() {
      public boolean accept(File dir, String name) {
        return name.endsWith("rbbi");
      }});
    if (files == null) throw new IOException("Path does not exist: " + srcDir);
    for (int i = 0; i < files.length; i++) {
      File file = files[i];
      File outputFile = new File(destDir,
          file.getName().replaceAll("rbbi$", "brk"));
      String rules = getRules(file);
      System.err.print("Compiling " + file.getName() + " to "
          + outputFile.getName() + ": ");
      /*
       * if there is a syntax error, compileRules() may succeed. the way to
       * check is to try to instantiate from the string. additionally if the
       * rules are invalid, you can get a useful syntax error.
       */
      try {
        new RuleBasedBreakIterator(rules);
      } catch (IllegalArgumentException e) {
        /*
         * do this intentionally, so you don't get a massive stack trace
         * instead, get a useful syntax error!
         */
        System.err.println(e.getMessage());
        System.exit(1);
      }
      FileOutputStream os = new FileOutputStream(outputFile);
      try {
        // RBBIRuleBuilder.compileRules(String, OutputStream) is not public
        // API, so it is invoked reflectively.
        Class<?> builderClass = Class.forName("com.ibm.icu.text.RBBIRuleBuilder");
        Method method = builderClass.getDeclaredMethod("compileRules", String.class, OutputStream.class);
        method.setAccessible(true);
        method.invoke(null, rules, os);
      } finally {
        // bugfix: close in a finally block so a failed compile does not
        // leak the output stream (and a truncated file handle).
        os.close();
      }
      System.err.println(outputFile.length() + " bytes.");
    }
  }

  /**
   * Entry point: {@code RBBIRuleCompiler <sourcedir> <destdir>}.
   */
  public static void main(String args[]) throws Exception {
    if (args.length < 2) {
      // bugfix: usage message previously said "RBBIRuleComputer"
      System.err.println("Usage: RBBIRuleCompiler <sourcedir> <destdir>");
      System.exit(1);
    }
    compile(new File(args[0]), new File(args[1]));
    System.exit(0);
  }
}