LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori

This commit is contained in:
Tomoko Uchida 2022-02-20 21:39:03 +09:00
parent e7a29c4c4c
commit 76c9fd4e38
11 changed files with 7 additions and 4 deletions

View File

@@ -202,11 +202,11 @@ Nori Korean Morphological Analyzer - Apache Lucene Integration
This software includes a binary and/or source version of data from
-mecab-ko-dic-2.0.3-20170922
+mecab-ko-dic-2.1.1-20180720
which can be obtained from
-https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz
+https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz
The floating point precision conversion in NumericUtils.Float16Converter is derived from work by
Jeroen van der Zijp, granted for use under the Apache license.

View File

@@ -54,7 +54,7 @@ configure(project(":lucene:analysis:nori")) {
dependsOn deleteDictionaryData
dependsOn sourceSets.main.runtimeClasspath
def dictionaryName = "mecab-ko-dic-2.0.3-20170922"
def dictionaryName = "mecab-ko-dic-2.1.1-20180720"
def dictionarySource = "https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/${dictionaryName}.tar.gz"
def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.tar.gz")
def unpackedDir = file("${buildDir}/generate/${dictionaryName}")

View File

@@ -189,6 +189,9 @@ Improvements
* LUCENE-10371: Make IndexRearranger able to arrange segment in a determined order.
(Patrick Zhai)
+* LUCENE-10416: Update Korean Dictionary to mecab-ko-dic-2.1.1-20180720 for Nori.
+(Uihyun Kim)
Optimizations
---------------------

View File

@@ -28,7 +28,7 @@ import java.util.List;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1798,3559,3677,SY,*,*,*,*,*,*,*";
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1801,3559,3677,SY,*,*,*,*,*,*,*";
private String encoding;