From 00d7f5ea68d8eaec618e4019714fda02060539a6 Mon Sep 17 00:00:00 2001 From: Namgyu Kim Date: Mon, 28 Sep 2020 20:28:21 +0900 Subject: [PATCH] LUCENE-9544: Port Nori dictionary compilation (#1926) --- build.gradle | 1 + gradle/generation/nori.gradle | 84 +++++++++++++++++++++++++++++++++++ lucene/CHANGES.txt | 2 + 3 files changed, 87 insertions(+) create mode 100644 gradle/generation/nori.gradle diff --git a/build.gradle b/build.gradle index 00e04d3550c..9826b57cfb9 100644 --- a/build.gradle +++ b/build.gradle @@ -150,6 +150,7 @@ apply from: file('gradle/generation/javacc.gradle') apply from: file('gradle/generation/util.gradle') apply from: file('gradle/generation/snowball.gradle') apply from: file('gradle/generation/kuromoji.gradle') +apply from: file('gradle/generation/nori.gradle') // Additional development aids. apply from: file('gradle/maven/maven-local.gradle') diff --git a/gradle/generation/nori.gradle b/gradle/generation/nori.gradle new file mode 100644 index 00000000000..eb6afa153b9 --- /dev/null +++ b/gradle/generation/nori.gradle @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This downloads and compiles Nori dictionaries. 
+
+// Runs the Nori DictionaryBuilder through project.javaexec and then logs the dictionary name.
+def recompileDictionary(project, dictionaryName, Closure closure) {
+  project.javaexec {
+    main = "org.apache.lucene.analysis.ko.util.DictionaryBuilder"
+    classpath = project.sourceSets.main.runtimeClasspath
+    jvmArgs '-Xmx1G'
+
+    with closure // applies the caller-supplied closure; the caller below uses it to append args
+  }
+  project.logger.lifecycle("Automaton regenerated from dictionary: ${dictionaryName}")
+}
+
+configure(project(":lucene:analysis:nori")) {
+  apply plugin: 'java-library'
+  apply plugin: "de.undercouch.download" // supplies the Download task type used by compileMecabKo
+
+  ext {
+    targetDir = file("src/resources") // handed to DictionaryBuilder as its second argument
+  }
+
+  task deleteDictionaryData() {
+    // Normally there is a single resources directory, but we cannot tell which
+    // one holds the dictionary data, so remove the *.dat files from every one.
+    doFirst {
+      sourceSets.main.resources.srcDirs.each { location ->
+        delete fileTree(dir: location, include: "org/apache/lucene/analysis/ko/dict/*.dat")
+      }
+    }
+  }
+
+  task compileMecabKo(type: Download) {
+    description "Recompile dictionaries from Mecab-Ko data."
+    group "generation"
+
+    dependsOn deleteDictionaryData
+    dependsOn sourceSets.main.runtimeClasspath // DictionaryBuilder is launched from this classpath
+
+    def dictionaryName = "mecab-ko-dic-2.0.3-20170922"
+    def dictionarySource = "https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/${dictionaryName}.tar.gz"
+    def dictionaryFile = file("${buildDir}/generate/${dictionaryName}.tar.gz")
+    def unpackedDir = file("${buildDir}/generate/${dictionaryName}")
+
+    src dictionarySource
+    dest dictionaryFile
+    onlyIfModified true // NOTE(review): presumably skips the download when the local archive is current - confirm in plugin docs
+
+    doLast {
+      // Unpack the downloaded archive into a fresh directory (cutdirsmapper strips the archive's top-level folder).
+      delete unpackedDir
+      ant.untar(src: dictionaryFile, dest: unpackedDir, compression: "gzip") {
+        ant.cutdirsmapper(dirs: "1")
+      }
+
+      // Compile the dictionary; args: unpacked dir, targetDir, encoding, boolean flag (meaning defined by DictionaryBuilder - TODO confirm).
+      recompileDictionary(project, dictionaryName, {
+        args += [
+          unpackedDir,
+          targetDir,
+          "utf-8",
+          false
+        ]
+      })
+    }
+  }
+}
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4fffed00b24..383ac8474a3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -184,6 +184,8 @@ Other
 
 * LUCENE-9497: Integrate Error Prone, a static analysis tool during compilation (Dawid Weiss, Varun Thacker)
 
+* LUCENE-9544: add regenerate gradle script for nori dictionary (Namgyu Kim)
+
 ======================= Lucene 8.7.0 =======================
 
 API Changes