LUCENE-8752: Add Japanese new imperial era '令和' (Reiwa) to the dictionary used in JapaneseTokenizer

2019-04-13 21:07:27 +09:00 · 2019-04-13 21:07:27 +09:00 · 7619c07d3a
parent 6e28cd60a8
commit 7619c07d3a
7 changed files with 30 additions and 2 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -110,7 +110,7 @@ Improvements

 * LUCENE-8750: Implements setMissingValue() on sort fields produced from 
  DoubleValuesSource and LongValuesSource (Mike Sokolov via Alan Woodward)
-
+  
 * LUCENE-8701: ToParentBlockJoinQuery now creates a child scorer that disallows skipping over
  non-competitive documents if the score of a parent depends on the score of multiple
  children (avg, max, min). Additionally the score mode `none` that assigns a constant score to
@@ -119,6 +119,10 @@ Improvements
 * LUCENE-8751: Weight#matches now use the ScorerSupplier to build scorers with a lead cost of 1
  (single document). (Jim Ferenczi)

+* LUCENE-8752: Japanese new era name '令和' (Reiwa) is added to the dictionary used in
+  JapaneseTokenizer so that the analyzer handles the era name correctly.
+  Reiwa is set to replace the Heisei Era on May 1, 2019. (Tomoko Uchida)
+
 Changes in Runtime Behavior

 * LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@@ -64,6 +64,11 @@
     <untar src="${build.dir}/${ipadic.version}.tar" dest="${build.dir}"/>
  </target>

+  <target name="patch-dict" depends="download-dict"> <!-- LUCENE-8752: runs after download-dict and patches Noun.proper.csv in place; build-dict depends on this target -->
+    <patch patchfile="src/tools/patches/Noun.proper.csv.patch"
+           originalfile="${dict.src.dir}/Noun.proper.csv"/> <!-- NOTE(review): Ant's patch task presumably relies on an external 'patch' tool; confirm it is available on build machines -->
+  </target>
+
  <path id="tools.dependencies">
    <fileset dir="../icu/lib"/>
  </path>
@@ -81,7 +86,7 @@
    <pathelement location="${build.dir}/classes/tools-test"/>
  </path>

-  <target name="build-dict" depends="compile-tools, download-dict">
+  <target name="build-dict" depends="compile-tools, patch-dict">
    <sequential>
      <delete verbose="true">
        <fileset dir="${resources.dir}/org/apache/lucene/analysis/ja/dict" includes="**/*"/>
--- a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat
+++ b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$buffer.dat
--- a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
+++ b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
--- a/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat
+++ b/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$targetMap.dat
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java
@@ -836,4 +836,16 @@ public class
    tokenizer.reset();
    while (tokenizer.incrementToken());
  }
+
+  public void testPatchedSystemDict() throws Exception { // LUCENE-8752: checks that "令和" (Reiwa), added by the dictionary patch, comes out as its own token
+    assertAnalyzesTo(analyzer, "令和元年", // input is expected to split into the two terms below
+        new String[]{"令和", "元年"}, // expected token terms
+        new int[]{0, 2}, // presumably the expected start offsets; confirm against assertAnalyzesTo
+        new int[]{2, 4}); // presumably the expected end offsets
+
+    assertAnalyzesTo(analyzerNormal, "令和元年", // same expectations with analyzerNormal (fixture declared outside this hunk)
+        new String[]{"令和", "元年"},
+        new int[]{0, 2},
+        new int[]{2, 4});
+  }
 }
--- a/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch
+++ b/lucene/analysis/kuromoji/src/tools/patches/Noun.proper.csv.patch
@@ -0,0 +1,7 @@
+--- Noun.proper.csv	2007-07-31 23:50:07.000000000 +0900
++++ Noun.proper.csv.20190403	2019-04-03 15:52:43.793191818 +0900
+@@ -27325,3 +27325,4 @@
+ 桃ノ木鼻,1288,1288,8538,名詞,固有名詞,一般,*,*,*,桃ノ木鼻,モモノキハナ,モモノキハナ
+ ドウ坂,1288,1288,3765,名詞,固有名詞,一般,*,*,*,ドウ坂,ドウザカ,ドーザカ
+ 戸城山,1288,1288,8538,名詞,固有名詞,一般,*,*,*,戸城山,トシロヤマ,トシロヤマ
++令和,1288,1288,5904,名詞,固有名詞,一般,*,*,*,令和,レイワ,レイワ