LUCENE-9901: UnicodeData.java has no regeneration task (#63)

This commit is contained in:
Dawid Weiss 2021-04-05 20:12:56 +02:00 committed by GitHub
parent 67a0bd4b6d
commit fbf9191abf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 67 additions and 60 deletions

View File

@ -81,18 +81,6 @@ ext {
minJavaVersion = JavaVersion.VERSION_11
// Declare script dependency versions outside of palantir's
// version unification control. These are not our main dependencies.
scriptDepVersions = [
"apache-rat": "0.11",
"commons-codec": "1.13",
"ecj": "3.25.0",
"javacc": "7.0.4",
"jflex": "1.7.0",
"jgit": "5.9.0.202009080501-r",
"flexmark": "0.61.24",
]
// Allow defining external tool locations using system props.
externalTool = { name ->
def resolved = propertyOrDefault("${name}.exe", name as String)
@ -101,6 +89,8 @@ ext {
}
}
apply from: file('buildSrc/scriptDepVersions.gradle')
// Include smaller chunks configuring dedicated build areas.
// Some of these intersect or add additional functionality.
// The order of inclusion of these files shouldn't matter (but may
@ -152,6 +142,7 @@ apply from: file('gradle/generation/kuromoji.gradle')
apply from: file('gradle/generation/nori.gradle')
apply from: file('gradle/generation/icu.gradle')
apply from: file('gradle/generation/javacc.gradle')
apply from: file('gradle/generation/unicode-data.gradle')
apply from: file('gradle/datasets/external-datasets.gradle')

View File

@ -15,26 +15,21 @@
* limitations under the License.
*/
// Make sure the build environment is consistent.
apply from: file('../gradle/validation/check-environment.gradle')
repositories {
mavenCentral()
}
ext {
// Declare script dependency versions outside of palantir's
// version unification control. These are not our main dependencies.
scriptDepVersions = [
"commons-codec": "1.13"
]
}
// Make sure the build environment is consistent.
apply from: file('../gradle/validation/check-environment.gradle')
// Load common buildSrc and script deps.
apply from: file("scriptDepVersions.gradle")
// Libraries available to code compiled in buildSrc. Versions are looked up in
// the shared scriptDepVersions map loaded from scriptDepVersions.gradle.
dependencies {
implementation gradleApi()
implementation localGroovy()
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
// NOTE(review): presumably present so that build scripts importing
// com.ibm.icu classes (e.g. the Unicode properties generator) can resolve
// ICU4J — confirm.
implementation "com.ibm.icu:icu4j:${scriptDepVersions['icu']}"
}

View File

@ -0,0 +1,16 @@
// Declare script dependency versions outside of palantir's
// version unification control. These are not our main dependencies
// but are reused in buildSrc and across applied scripts.
//
// Entries are looked up by key, e.g. scriptDepVersions['commons-codec'].
ext {
scriptDepVersions = [
"apache-rat": "0.11",
"commons-codec": "1.13",
"ecj": "3.25.0",
"flexmark": "0.61.24",
// Must equal the version of the project's com.ibm.icu:icu4j dependency:
// the generateUnicodeProps task compares the two and fails on a mismatch.
"icu": "68.2",
"javacc": "7.0.4",
"jflex": "1.7.0",
"jgit": "5.9.0.202009080501-r",
]
}

View File

@ -1,3 +1,6 @@
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.VersionInfo;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -15,34 +18,36 @@
* limitations under the License.
*/
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.VersionInfo;
// Regenerates UnicodeProps.java
configure(project(":lucene:analysis:common")) {
task generateUnicodeProps() {
def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
def linesep = properties['line.separator'];
def icuVersion = VersionInfo.ICU_VERSION.toString()
def unicodeVersion = UCharacter.getUnicodeVersion().toString()
def appendChar = { StringBuilder sb, int c ->
int len = sb.length();
if (len != 0) {
sb.append(', ');
}
if (len == 0 || len - sb.lastIndexOf(linesep) > 100) {
sb.append(linesep).append(' ');
}
sb.append(String.format(Locale.ROOT, "0x%04X", c));
}
inputs.property("icu-version", icuVersion)
inputs.property("unicode-version", unicodeVersion)
outputs.file outputFile
def whitespace = new StringBuilder();
for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
if (UCharacter.isUWhiteSpace(c)) {
appendChar(whitespace, c);
}
}
doFirst {
def icuLockDepVersion = getVersion("com.ibm.icu", "icu4j")
def icuScriptDep = scriptDepVersions['icu']
if (icuLockDepVersion != icuScriptDep) {
throw new GradleException("ICU version in build script dependency ${icuScriptDep} and in" +
" project dependency ${icuLockDepVersion} must match.")
}
def icuVersion = VersionInfo.ICU_VERSION.toString();
def unicodeVersion = UCharacter.getUnicodeVersion().toString();
List<String> chars = []
for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
if (UCharacter.isUWhiteSpace(c)) {
chars.add(String.format(Locale.ROOT, "0x%04X", c))
}
}
def whitespace = chars.join(", ")
def code = """
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
def code = """
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -68,9 +73,7 @@ import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was created using ICU4J v${icuVersion}
* <p>
* Unicode version: ${unicodeVersion}
* The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
*/
public final class UnicodeProps {
private UnicodeProps() {}
@ -80,7 +83,7 @@ public final class UnicodeProps {
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(${whitespace});
private static Bits createBits(final int... codepoints) {
final int len = codepoints[codepoints.length - 1] + 1;
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
@ -98,9 +101,10 @@ public final class UnicodeProps {
};
}
}
""";
"""
outputFile.setText(code.trim(), "UTF-8")
}
}
File f = new File(properties['unicode-props-file']);
f.write(code.trim(), 'UTF-8');
task.log("Unicode data written to: " + f);
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
}

View File

@ -0,0 +1,3 @@
{
"lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java": "7d2cf5f959c2dfc5b83295e359212a1228f761c4"
}

View File

@ -1,4 +1,4 @@
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -24,15 +24,13 @@ import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s. The data was
* created using ICU4J v62.2.0.0
*
* <p>Unicode version: 11.0.0.0
* generated using ICU4J v68.2.0.0, unicode version: 13.0.0.0.
*/
public final class UnicodeProps {
private UnicodeProps() {}
/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "11.0.0.0";
public static final String UNICODE_VERSION = "13.0.0.0";
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE =