mirror of https://github.com/apache/lucene.git
LUCENE-2629: fix analysis/icu's gennorm2 task
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@991053 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
86fbce2608
commit
912a6152a8
|
@ -117,6 +117,11 @@ Bug fixes
|
|||
* LUCENE-2615: Fix DirectIOLinuxDirectory to not assign bogus
|
||||
permissions to newly created files, and to not silently hardwire
|
||||
buffer size to 1 MB. (Mark Miller, Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-2629: Fix gennorm2 task for generating ICUFoldingFilter's .nrm file. This allows
|
||||
you to customize its normalization/folding by editing the source data files in src/data
|
||||
and regenerating a new .nrm with 'ant gennorm2'. (David Bowen via Robert Muir)
|
||||
|
||||
|
||||
API Changes
|
||||
|
||||
|
|
|
@ -65,17 +65,27 @@
|
|||
<property name="gennorm2.src.dir" value="src/data/utr30"/>
|
||||
<property name="gennorm2.src.files"
|
||||
value="nfkc.txt nfkc_cf.txt BasicFoldings.txt DiacriticFolding.txt DingbatFolding.txt HanRadicalFolding.txt NativeDigitFolding.txt"/>
|
||||
<property name="gennorm2.tmp" value="${build.dir}/gennorm2/utr30.tmp"/>
|
||||
<property name="gennorm2.dst" value="src/resources/org/apache/lucene/analysis/icu/utr30.nrm"/>
|
||||
<target name="gennorm2">
|
||||
<echo>Warning: only works on a big-endian platform!</echo>
|
||||
<echo>Note that the gennorm2 and icupkg tools must be on your PATH. These tools
|
||||
are part of the ICU4C package. See http://site.icu-project.org/ </echo>
|
||||
<mkdir dir="${build.dir}/gennorm2"/>
|
||||
<exec executable="gennorm2" failonerror="true">
|
||||
<arg value="-v"/>
|
||||
<arg value="-s"/>
|
||||
<arg value="${gennorm2.src.dir}"/>
|
||||
<arg value="${gennorm2.src.files}"/>
|
||||
<arg line="${gennorm2.src.files}"/>
|
||||
<arg value="-o"/>
|
||||
<arg value="${gennorm2.tmp}"/>
|
||||
</exec>
|
||||
<!-- now convert binary file to big-endian -->
|
||||
<exec executable="icupkg" failonerror="true">
|
||||
<arg value="-tb"/>
|
||||
<arg value="${gennorm2.tmp}"/>
|
||||
<arg value="${gennorm2.dst}"/>
|
||||
</exec>
|
||||
<delete file="${gennorm2.tmp}"/>
|
||||
</target>
|
||||
|
||||
<property name="rbbi.src.dir" location="src/data/uax29"/>
|
||||
|
|
Binary file not shown.
|
@ -68,5 +68,12 @@ public class TestICUFoldingFilter extends BaseTokenStreamTestCase {
|
|||
|
||||
// ascii-folding-filter type stuff
|
||||
assertAnalyzesTo(a, "đis is cræzy", new String[] { "dis", "is", "craezy" });
|
||||
|
||||
// proper downcasing of Turkish dotted-capital I
|
||||
// (according to default case folding rules)
|
||||
assertAnalyzesTo(a, "ELİF", new String[] { "elif" });
|
||||
|
||||
// handling of decomposed combining-dot-above
|
||||
assertAnalyzesTo(a, "eli\u0307f", new String[] { "elif" });
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue