LUCENE-2629: fix analysis/icu's gennorm2 task

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@991053 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-08-31 01:33:02 +00:00
parent 86fbce2608
commit 912a6152a8
4 changed files with 24 additions and 2 deletions

View File

@ -117,6 +117,11 @@ Bug fixes
* LUCENE-2615: Fix DirectIOLinuxDirectory to not assign bogus
permissions to newly created files, and to not silently hardwire
buffer size to 1 MB. (Mark Miller, Robert Muir, Mike McCandless)
* LUCENE-2629: Fix gennorm2 task for generating ICUFoldingFilter's .nrm file. This allows
you to customize its normalization/folding, by editing the source data files in src/data
and regenerating a new .nrm with 'ant gennorm2'. (David Bowen via Robert Muir)
API Changes

View File

@ -65,17 +65,27 @@
<property name="gennorm2.src.dir" value="src/data/utr30"/>
<property name="gennorm2.src.files"
value="nfkc.txt nfkc_cf.txt BasicFoldings.txt DiacriticFolding.txt DingbatFolding.txt HanRadicalFolding.txt NativeDigitFolding.txt"/>
<property name="gennorm2.tmp" value="${build.dir}/gennorm2/utr30.tmp"/>
<property name="gennorm2.dst" value="src/resources/org/apache/lucene/analysis/icu/utr30.nrm"/>
<target name="gennorm2">
<echo>Warning: only works on a big-endian platform!</echo>
<echo>Note that the gennorm2 and icupkg tools must be on your PATH. These tools
are part of the ICU4C package. See http://site.icu-project.org/ </echo>
<mkdir dir="${build.dir}/gennorm2"/>
<exec executable="gennorm2" failonerror="true">
<arg value="-v"/>
<arg value="-s"/>
<arg value="${gennorm2.src.dir}"/>
<arg value="${gennorm2.src.files}"/>
<arg line="${gennorm2.src.files}"/>
<arg value="-o"/>
<arg value="${gennorm2.tmp}"/>
</exec>
<!-- now convert binary file to big-endian -->
<exec executable="icupkg" failonerror="true">
<arg value="-tb"/>
<arg value="${gennorm2.tmp}"/>
<arg value="${gennorm2.dst}"/>
</exec>
<delete file="${gennorm2.tmp}"/>
</target>
<property name="rbbi.src.dir" location="src/data/uax29"/>

View File

@ -68,5 +68,12 @@ public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
// ascii-folding-filter type stuff
assertAnalyzesTo(a, "đis is cræzy", new String[] { "dis", "is", "craezy" });
// proper downcasing of Turkish dotted-capital I
// (according to default case folding rules)
assertAnalyzesTo(a, "ELİF", new String[] { "elif" });
// handling of decomposed combining-dot-above
assertAnalyzesTo(a, "eli\u0307f", new String[] { "elif" });
}
}