mirror of https://github.com/apache/lucene.git
LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer
This commit is contained in:
parent
6e28cd60a8
commit
7619c07d3a
|
@ -110,7 +110,7 @@ Improvements
|
|||
|
||||
* LUCENE-8750: Implements setMissingValue() on sort fields produced from
|
||||
DoubleValuesSource and LongValuesSource (Mike Sokolov via Alan Woodward)
|
||||
|
||||
|
||||
* LUCENE-8701: ToParentBlockJoinQuery now creates a child scorer that disallows skipping over
|
||||
non-competitive documents if the score of a parent depends on the score of multiple
|
||||
children (avg, max, min). Additionally the score mode `none` that assigns a constant score to
|
||||
|
@ -119,6 +119,10 @@ Improvements
|
|||
* LUCENE-8751: Weight#matches now uses the ScorerSupplier to build scorers with a lead cost of 1
|
||||
(single document). (Jim Ferenczi)
|
||||
|
||||
* LUCENE-8752: The new Japanese era name '令和' (Reiwa) is added to the dictionary used in
|
||||
JapaneseTokenizer so that the analyzer handles the era name correctly.
|
||||
Reiwa is set to replace the Heisei Era on May 1, 2019. (Tomoko Uchida)
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
||||
* LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
|
||||
|
|
|
@ -64,6 +64,11 @@
|
|||
<untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
|
||||
</target>
|
||||
|
||||
<target name="patch-dict" depends="download-dict">
|
||||
<patch patchfile="src/tools/patches/Noun.proper.csv.patch"
|
||||
originalfile="${dict.src.dir}/Noun.proper.csv"/>
|
||||
</target>
|
||||
|
||||
<path id="tools.dependencies">
|
||||
<fileset dir="../icu/lib"/>
|
||||
</path>
|
||||
|
@ -81,7 +86,7 @@
|
|||
<pathelement location="${build.dir}/classes/tools-test"/>
|
||||
</path>
|
||||
|
||||
<target name="build-dict" depends="compile-tools, download-dict">
|
||||
<target name="build-dict" depends="compile-tools, patch-dict">
|
||||
<sequential>
|
||||
<delete verbose="true">
|
||||
<fileset dir="${resources.dir}/org/apache/lucene/analysis/ja/dict" includes="**/*"/>
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -836,4 +836,16 @@ public class
|
|||
tokenizer.reset();
|
||||
while (tokenizer.incrementToken());
|
||||
}
|
||||
|
||||
public void testPatchedSystemDict() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "令和元年",
|
||||
new String[]{"令和", "元年"},
|
||||
new int[]{0, 2},
|
||||
new int[]{2, 4});
|
||||
|
||||
assertAnalyzesTo(analyzerNormal, "令和元年",
|
||||
new String[]{"令和", "元年"},
|
||||
new int[]{0, 2},
|
||||
new int[]{2, 4});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
--- Noun.proper.csv 2007-07-31 23:50:07.000000000 +0900
|
||||
+++ Noun.proper.csv.20190403 2019-04-03 15:52:43.793191818 +0900
|
||||
@@ -27325,3 +27325,4 @@
|
||||
桃ノ木鼻,1288,1288,8538,名詞,固有名詞,一般,*,*,*,桃ノ木鼻,モモノキハナ,モモノキハナ
|
||||
ドウ坂,1288,1288,3765,名詞,固有名詞,一般,*,*,*,ドウ坂,ドウザカ,ドーザカ
|
||||
戸城山,1288,1288,8538,名詞,固有名詞,一般,*,*,*,戸城山,トシロヤマ,トシロヤマ
|
||||
+令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ
|
Loading…
Reference in New Issue