LUCENE-9767: infrastructure for icu regeneration in place. (#2362)

This commit is contained in:
Dawid Weiss 2021-02-14 21:07:39 +01:00 committed by GitHub
parent 97763ad3ce
commit 8f56ae0a4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 215 additions and 11 deletions

View File

@ -154,6 +154,7 @@ apply from: file('gradle/generation/util.gradle')
apply from: file('gradle/generation/snowball.gradle')
apply from: file('gradle/generation/kuromoji.gradle')
apply from: file('gradle/generation/nori.gradle')
apply from: file('gradle/generation/icu.gradle')
// Shared configuration of subprojects containing native code.
apply from: file('gradle/native/disable-native.gradle')

View File

@ -0,0 +1,210 @@
import org.apache.tools.ant.taskdefs.condition.Os
import java.nio.file.Files
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Regenerates ICU data files.
*
* The icu4c version must match exactly the icu4j version in version.props:
* The one on your system is probably different. This script will attempt to
* download and compile a matching icu4c version automatically.
*/
configure(project(":lucene:analysis:icu")) {
def utr30DataDir = file("src/data/utr30")
def icuBuildDir = file("${buildDir}/icu")
def icuBinDir
def gennorm
def icupkg
if (Os.isFamily(Os.FAMILY_WINDOWS)) {
icuBinDir = file("${icuBuildDir}/bin64")
gennorm = file("${icuBinDir}/gennorm2.exe")
icupkg = file("${icuBinDir}/icupkg.exe")
} else {
icuBinDir = file("${icuBuildDir}/icu/source/bin")
gennorm = file("${icuBinDir}/gennorm2")
icupkg = file("${icuBinDir}/icupkg")
}
task genUtr30DataFiles() {
dependsOn Os.isFamily(Os.FAMILY_WINDOWS) ? "compileIcuWindows" : "compileIcuLinux"
// May be undefined yet, so use a provider.
dependsOn { sourceSets.tools.runtimeClasspath }
doFirst {
// all these steps must be done sequentially: it's a pipeline resulting in utr30.nrm
project.javaexec {
main = "org.apache.lucene.analysis.icu.GenerateUTR30DataFiles"
classpath = sourceSets.tools.runtimeClasspath
ignoreExitValue false
workingDir utr30DataDir
}
project.exec {
executable gennorm
ignoreExitValue = false
args = [
"-v",
"-s",
utr30DataDir,
"-o",
"${buildDir}/utr30.tmp",
"nfc.txt", "nfkc.txt", "nfkc_cf.txt", "BasicFoldings.txt",
"DiacriticFolding.txt", "DingbatFolding.txt", "HanRadicalFolding.txt",
"NativeDigitFolding.txt"
]
}
project.exec {
executable icupkg
ignoreExitValue = false
args = [
"-tb",
"${buildDir}/utr30.tmp",
"src/resources/org/apache/lucene/analysis/icu/utr30.nrm"
]
}
}
}
task genRbbi() {
// May be undefined yet, so use a provider.
dependsOn { sourceSets.tools.runtimeClasspath }
doFirst {
project.javaexec {
main = "org.apache.lucene.analysis.icu.RBBIRuleCompiler"
classpath = sourceSets.tools.runtimeClasspath
ignoreExitValue false
enableAssertions true
args = [
"src/data/uax29",
"src/resources/org/apache/lucene/analysis/icu/segmentation"
]
}
}
}
task regenerate() {
dependsOn genUtr30DataFiles
dependsOn genRbbi
}
task compileIcuWindows() {
doFirst {
def v = getVersion('com.ibm.icu', 'icu4j');
def icuBinZip = file("${icuBuildDir}/icu4c-${v.replace(".", "_")}.zip")
if (!icuBinZip.exists()) {
icuBuildDir.mkdirs()
// No official Windows binaries for ICU 62.2... I've checked the script
// against ICU 68.2 (below) and it works but results in a different generated file.
// def src = URI.create("https://github.com/unicode-org/icu/releases/download/release-68-2/icu4c-68_2-Win64-MSVC2019.zip")
def src = URI.create("https://github.com/unicode-org/icu/releases/download/release-${v.replace(".", "-")}/icu4c-${v.replace(".", "_")}-Win64-MSVC2019.zip")
logger.lifecycle("Trying to download binary ICU version: ${v} from:\n ${src}")
Files.write(icuBinZip.toPath(), src.toURL().openStream().bytes)
logger.lifecycle("Downloaded ${icuBinZip.size()} bytes.")
}
// Unzip.
project.copy {
into icuBuildDir
from zipTree(icuBinZip)
}
}
}
task compileIcuLinux() {
doFirst {
if (Os.isFamily(Os.FAMILY_WINDOWS)) {
throw new GradleException("ICU compilation not supported on Windows.")
}
def v = getVersion('com.ibm.icu', 'icu4j');
def icuSrcTgz = file("${icuBuildDir}/icu4c-${v.replace(".", "_")}-src.tgz")
// download version matching icu4j version in version.props
// curl -fLO https://github.com/unicode-org/icu/releases/download/release-62-2/icu4c-62_2-src.tgz
if (!icuSrcTgz.exists()) {
icuBuildDir.mkdirs()
def src = URI.create("https://github.com/unicode-org/icu/releases/download/release-${v.replace(".", "-")}/icu4c-${v.replace(".", "_")}-src.tgz")
logger.lifecycle("Trying to download and compile ICU version: ${v} from:\n ${src}")
Files.write(icuSrcTgz.toPath(), src.toURL().openStream().bytes)
logger.lifecycle("Downloaded ${icuSrcTgz.size()} bytes.")
}
def icuSrcDir = file("${icuBuildDir}/icu/source")
project.delete icuSrcDir
// extract tgz
// tar -zxvf icu4c-62_2-src.tgz
project.exec {
executable "tar"
ignoreExitValue false
workingDir icuBuildDir
args = [
"-zxvf",
icuSrcTgz
]
}
// compile
// (cd icu/source && ./configure --prefix=$(pwd) --enable-rpath && make -j4)
project.exec {
executable "sh"
ignoreExitValue false
workingDir icuSrcDir
args = [
"configure",
"--prefix=${icuSrcDir}",
"--enable-rpath"
]
}
project.exec {
executable "make"
ignoreExitValue false
workingDir icuSrcDir
args = [
"-j4"
]
}
// test that the binaries work
// derb -V
logger.lifecycle("Compiled ICU, checking...")
project.exec {
executable "./derb"
ignoreExitValue false
workingDir icuBinDir
args = [
"-V"
]
}
}
}
}

View File

@ -61,7 +61,7 @@ public class GenerateUTR30DataFiles {
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
private static final String NFKC_CF_TXT = "nfkc_cf.txt";
private static byte[] DOWNLOAD_BUFFER = new byte[8192];
private static final Pattern ROUND_TRIP_MAPPING_LINE_PATTERN =
Pattern.compile("^\\s*([^=]+?)\\s*=\\s*(.*)$");
private static final Pattern VERBATIM_RULE_LINE_PATTERN =
@ -209,16 +209,9 @@ public class GenerateUTR30DataFiles {
private static void download(URL url, String outputFile) throws IOException {
final URLConnection connection = openConnection(url);
final InputStream inputStream = connection.getInputStream();
final OutputStream outputStream = Files.newOutputStream(Path.of(outputFile));
int numBytes;
try {
while (-1 != (numBytes = inputStream.read(DOWNLOAD_BUFFER))) {
outputStream.write(DOWNLOAD_BUFFER, 0, numBytes);
}
} finally {
inputStream.close();
outputStream.close();
try (InputStream inputStream = connection.getInputStream();
OutputStream outputStream = Files.newOutputStream(Path.of(outputFile))) {
inputStream.transferTo(outputStream);
}
}