mirror of https://github.com/apache/lucene.git
LUCENE-9901: UnicodeData.java has no regeneration task (#63)
This commit is contained in:
parent
67a0bd4b6d
commit
fbf9191abf
15
build.gradle
15
build.gradle
|
@ -81,18 +81,6 @@ ext {
|
|||
|
||||
minJavaVersion = JavaVersion.VERSION_11
|
||||
|
||||
// Declare script dependency versions outside of palantir's
|
||||
// version unification control. These are not our main dependencies.
|
||||
scriptDepVersions = [
|
||||
"apache-rat": "0.11",
|
||||
"commons-codec": "1.13",
|
||||
"ecj": "3.25.0",
|
||||
"javacc": "7.0.4",
|
||||
"jflex": "1.7.0",
|
||||
"jgit": "5.9.0.202009080501-r",
|
||||
"flexmark": "0.61.24",
|
||||
]
|
||||
|
||||
// Allow defining external tool locations using system props.
|
||||
externalTool = { name ->
|
||||
def resolved = propertyOrDefault("${name}.exe", name as String)
|
||||
|
@ -101,6 +89,8 @@ ext {
|
|||
}
|
||||
}
|
||||
|
||||
apply from: file('buildSrc/scriptDepVersions.gradle')
|
||||
|
||||
// Include smaller chunks configuring dedicated build areas.
|
||||
// Some of these intersect or add additional functionality.
|
||||
// The order of inclusion of these files shouldn't matter (but may
|
||||
|
@ -152,6 +142,7 @@ apply from: file('gradle/generation/kuromoji.gradle')
|
|||
apply from: file('gradle/generation/nori.gradle')
|
||||
apply from: file('gradle/generation/icu.gradle')
|
||||
apply from: file('gradle/generation/javacc.gradle')
|
||||
apply from: file('gradle/generation/unicode-data.gradle')
|
||||
|
||||
apply from: file('gradle/datasets/external-datasets.gradle')
|
||||
|
||||
|
|
|
@ -15,26 +15,21 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
// Make sure the build environment is consistent.
|
||||
apply from: file('../gradle/validation/check-environment.gradle')
|
||||
|
||||
repositories {
|
||||
mavenCentral()
|
||||
}
|
||||
|
||||
ext {
|
||||
// Declare script dependency versions outside of palantir's
|
||||
// version unification control. These are not our main dependencies.
|
||||
scriptDepVersions = [
|
||||
"commons-codec": "1.13"
|
||||
]
|
||||
}
|
||||
// Make sure the build environment is consistent.
|
||||
apply from: file('../gradle/validation/check-environment.gradle')
|
||||
|
||||
// Load common buildSrc and script deps.
|
||||
apply from: file("scriptDepVersions.gradle")
|
||||
|
||||
dependencies {
|
||||
implementation gradleApi()
|
||||
implementation localGroovy()
|
||||
|
||||
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
|
||||
implementation "com.ibm.icu:icu4j:${scriptDepVersions['icu']}"
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
// Declare script dependency versions outside of palantir's
|
||||
// version unification control. These are not our main dependencies
|
||||
// but are reused in buildSrc and across applied scripts.
|
||||
|
||||
ext {
|
||||
scriptDepVersions = [
|
||||
"apache-rat": "0.11",
|
||||
"commons-codec": "1.13",
|
||||
"ecj": "3.25.0",
|
||||
"flexmark": "0.61.24",
|
||||
"icu": "68.2",
|
||||
"javacc": "7.0.4",
|
||||
"jflex": "1.7.0",
|
||||
"jgit": "5.9.0.202009080501-r",
|
||||
]
|
||||
}
|
|
@ -1,3 +1,6 @@
|
|||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -15,34 +18,36 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
// Regenerates UnicodeProps.java
|
||||
configure(project(":lucene:analysis:common")) {
|
||||
task generateUnicodeProps() {
|
||||
def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
|
||||
|
||||
def linesep = properties['line.separator'];
|
||||
def icuVersion = VersionInfo.ICU_VERSION.toString()
|
||||
def unicodeVersion = UCharacter.getUnicodeVersion().toString()
|
||||
|
||||
def appendChar = { StringBuilder sb, int c ->
|
||||
int len = sb.length();
|
||||
if (len != 0) {
|
||||
sb.append(', ');
|
||||
}
|
||||
if (len == 0 || len - sb.lastIndexOf(linesep) > 100) {
|
||||
sb.append(linesep).append(' ');
|
||||
}
|
||||
sb.append(String.format(Locale.ROOT, "0x%04X", c));
|
||||
}
|
||||
inputs.property("icu-version", icuVersion)
|
||||
inputs.property("unicode-version", unicodeVersion)
|
||||
outputs.file outputFile
|
||||
|
||||
def whitespace = new StringBuilder();
|
||||
for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
|
||||
if (UCharacter.isUWhiteSpace(c)) {
|
||||
appendChar(whitespace, c);
|
||||
}
|
||||
}
|
||||
doFirst {
|
||||
def icuLockDepVersion = getVersion("com.ibm.icu", "icu4j")
|
||||
def icuScriptDep = scriptDepVersions['icu']
|
||||
if (icuLockDepVersion != icuScriptDep) {
|
||||
throw new GradleException("ICU version in build script dependency ${icuScriptDep} and in" +
|
||||
" project dependency ${icuLockDepVersion} must match.")
|
||||
}
|
||||
|
||||
def icuVersion = VersionInfo.ICU_VERSION.toString();
|
||||
def unicodeVersion = UCharacter.getUnicodeVersion().toString();
|
||||
List<String> chars = []
|
||||
for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
|
||||
if (UCharacter.isUWhiteSpace(c)) {
|
||||
chars.add(String.format(Locale.ROOT, "0x%04X", c))
|
||||
}
|
||||
}
|
||||
def whitespace = chars.join(", ")
|
||||
|
||||
def code = """
|
||||
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
|
||||
def code = """
|
||||
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -68,9 +73,7 @@ import org.apache.lucene.util.SparseFixedBitSet;
|
|||
|
||||
/**
|
||||
* This file contains unicode properties used by various {@link CharTokenizer}s.
|
||||
* The data was created using ICU4J v${icuVersion}
|
||||
* <p>
|
||||
* Unicode version: ${unicodeVersion}
|
||||
* The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
|
||||
*/
|
||||
public final class UnicodeProps {
|
||||
private UnicodeProps() {}
|
||||
|
@ -80,7 +83,7 @@ public final class UnicodeProps {
|
|||
|
||||
/** Bitset with Unicode WHITESPACE code points. */
|
||||
public static final Bits WHITESPACE = createBits(${whitespace});
|
||||
|
||||
|
||||
private static Bits createBits(final int... codepoints) {
|
||||
final int len = codepoints[codepoints.length - 1] + 1;
|
||||
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
|
||||
|
@ -98,9 +101,10 @@ public final class UnicodeProps {
|
|||
};
|
||||
}
|
||||
}
|
||||
""";
|
||||
"""
|
||||
outputFile.setText(code.trim(), "UTF-8")
|
||||
}
|
||||
}
|
||||
|
||||
File f = new File(properties['unicode-props-file']);
|
||||
f.write(code.trim(), 'UTF-8');
|
||||
|
||||
task.log("Unicode data written to: " + f);
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java": "7d2cf5f959c2dfc5b83295e359212a1228f761c4"
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
// DO NOT EDIT THIS FILE! Use "ant unicode-data" to recreate.
|
||||
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -24,15 +24,13 @@ import org.apache.lucene.util.SparseFixedBitSet;
|
|||
|
||||
/**
|
||||
* This file contains unicode properties used by various {@link CharTokenizer}s. The data was
|
||||
* created using ICU4J v62.2.0.0
|
||||
*
|
||||
* <p>Unicode version: 11.0.0.0
|
||||
* generated using ICU4J v68.2.0.0, unicode version: 13.0.0.0.
|
||||
*/
|
||||
public final class UnicodeProps {
|
||||
private UnicodeProps() {}
|
||||
|
||||
/** Unicode version that was used to generate this file: {@value} */
|
||||
public static final String UNICODE_VERSION = "11.0.0.0";
|
||||
public static final String UNICODE_VERSION = "13.0.0.0";
|
||||
|
||||
/** Bitset with Unicode WHITESPACE code points. */
|
||||
public static final Bits WHITESPACE =
|
||||
|
|
Loading…
Reference in New Issue