2021-04-19 07:37:47 -04:00
|
|
|
import java.nio.file.Files
|
|
|
|
|
2020-01-27 12:05:34 -05:00
|
|
|
/*
|
|
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
|
|
* contributor license agreements. See the NOTICE file distributed with
|
|
|
|
* this work for additional information regarding copyright ownership.
|
|
|
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
|
|
* (the "License"); you may not use this file except in compliance with
|
|
|
|
* the License. You may obtain a copy of the License at
|
|
|
|
*
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
*
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
* See the License for the specific language governing permissions and
|
|
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
|
2020-01-26 13:44:18 -05:00
|
|
|
// Declare a root-level "jflex" configuration that resolves the JFlex tool used by the regenerate tasks below.
|
|
|
|
|
|
|
|
configure(rootProject) {
  // Create (or reuse) the configuration that JFlexTask puts on the classpath of the forked JVM.
  configurations.maybeCreate("jflex")

  // The JFlex version is looked up in scriptDepVersions.
  dependencies.add("jflex", "de.jflex:jflex:${scriptDepVersions['jflex']}")
}
|
|
|
|
|
2021-04-02 03:56:47 -04:00
|
|
|
// Resource location for this script; the two JFlex skeleton files below live there.
def resources = scriptResources(buildscript)

// Skeleton used by the Wikipedia, Classic and HTMLStripCharFilter grammars.
def skeletonDefault = file("$resources/skeleton.default.txt")

// Skeleton used by the Standard and UAX29URLEmail tokenizer grammars.
def skeletonNoBufferExpansion = file("$resources/skeleton.disable.buffer.expansion.txt")
|
2020-01-26 13:44:18 -05:00
|
|
|
|
|
|
|
configure(project(":lucene:core")) {
|
2021-04-16 16:35:51 -04:00
|
|
|
// Runs JFlex on StandardTokenizerImpl.jflex and then patches the generated Java source.
task generateStandardTokenizerInternal(type: JFlexTask) {
  group = "generation"
  description = "Regenerate StandardTokenizerImpl.java"

  skeleton = skeletonNoBufferExpansion
  jflexFile = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex')

  // Add included files as inputs.
  inputs.file(project(":lucene:core").file('src/data/jflex/UnicodeEmojiProperties.jflex'))

  doLast {
    // Rewrite the generated ZZ_BUFFERSIZE declaration from a static final constant to an instance field.
    def generatedSource = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java')
    ant.replace(
        file: generatedSource,
        encoding: "UTF-8",
        token: "private static final int ZZ_BUFFERSIZE =",
        value: "private int ZZ_BUFFERSIZE ="
    )
  }
}
|
2021-03-23 04:25:53 -04:00
|
|
|
|
2021-05-02 13:17:18 -04:00
|
|
|
// Wrap the internal task via wrapWithPersistentChecksums, passing the spotless tasks as andThenTasks.
def generateStandardTokenizer = wrapWithPersistentChecksums(
    generateStandardTokenizerInternal,
    [andThenTasks: ["spotlessJava", "spotlessJavaApply"]])

configure(generateStandardTokenizer) {
  // StandardTokenizerImpl.jflex includes UnicodeEmojiProperties.jflex so we make sure it's up to date.
  dependsOn(":lucene:core:generateEmojiProperties")
}

// Hook the wrapped task into the "regenerate" task.
regenerate.dependsOn(generateStandardTokenizer)
|
2020-01-26 13:44:18 -05:00
|
|
|
}
|
2020-01-27 06:36:13 -05:00
|
|
|
|
|
|
|
configure(project(":lucene:analysis:common")) {
|
2021-04-16 16:35:51 -04:00
|
|
|
// Runs GenerateJflexTLDMacros against the IANA TLD list URL and, when the generated
// content actually changed, updates ASCIITLD.jflex and the TLDs.txt test resource.
task generateTldsInternal() {
  def tldZones = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
  def jflexMacro = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
  def tldList = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")

  description "Regenerate top-level domain jflex macros and tests"
  group "generation"

  dependsOn { sourceSets.tools.runtimeClasspath }

  inputs.property "tldZones", tldZones
  outputs.files jflexMacro, tldList

  doFirst {
    // Generate into scratch files first so the real outputs are only touched when needed.
    File tmpJflexMacro = File.createTempFile(jflexMacro.getName(), ".tmp", getTemporaryDir())
    File tmpTldList = File.createTempFile(tldList.getName(), ".tmp", getTemporaryDir())

    try {
      project.javaexec {
        main = "org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
        classpath = sourceSets.tools.runtimeClasspath

        ignoreExitValue false
        args = [
          tldZones,
          tmpJflexMacro,
          tmpTldList
        ]
      }

      // LUCENE-9926: tldZones is regenerated daily. Compare the generated content (excluding comments) so that
      // we only update actual output files if non-comments have changed.
      def contentLines = { File file ->
        if (file.exists()) {
          List<String> lines = file.readLines("UTF-8")
          lines.removeIf { line -> line.isBlank() || line.startsWith("//") }
          return lines
        } else {
          return []
        }
      }

      // A missing macro file must always be (re)written, even if the TLD list itself is unchanged;
      // otherwise a declared output of this task would stay absent.
      boolean unchanged = jflexMacro.exists() &&
          contentLines(tmpTldList).equals(contentLines(tldList))

      if (unchanged) {
        logger.lifecycle("Generated TLD content identical as before, not updating.")
      } else {
        tldList.setBytes tmpTldList.bytes
        jflexMacro.setBytes tmpJflexMacro.bytes
        logger.lifecycle("You've regenerated the TLD include file, remember to regenerate UAX29URLEmailTokenizerImpl too.")
      }
    } finally {
      // createTempFile never cleans up after itself; remove the scratch files so they
      // do not accumulate in the task's temporary directory across runs.
      tmpJflexMacro.delete()
      tmpTldList.delete()
    }
  }
}
|
|
|
|
|
2021-04-16 16:35:51 -04:00
|
|
|
// Runs JFlex on WikipediaTokenizerImpl.jflex with the default skeleton.
task generateWikipediaTokenizerInternal(type: JFlexTask) {
  group = "generation"
  description = "Regenerate WikipediaTokenizerImpl.java"

  skeleton = skeletonDefault
  jflexFile = file('src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex')
}
|
|
|
|
|
2021-04-16 16:35:51 -04:00
|
|
|
// Runs JFlex on ClassicTokenizerImpl.jflex with the default skeleton.
task generateClassicTokenizerInternal(type: JFlexTask) {
  group = "generation"
  description = "Regenerate ClassicTokenizerImpl.java"

  skeleton = skeletonDefault
  jflexFile = file('src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex')
}
|
|
|
|
|
2021-04-16 16:35:51 -04:00
|
|
|
// Runs JFlex on UAX29URLEmailTokenizerImpl.jflex (with a 12g heap) and then patches the generated Java source.
task generateUAX29URLEmailTokenizerInternal(type: JFlexTask) {
  group = "generation"
  description = "Regenerate UAX29URLEmailTokenizerImpl.java"

  heapSize = "12g"
  skeleton = skeletonNoBufferExpansion
  jflexFile = file('src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex')

  // Add included files as inputs.
  inputs.file(file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex'))
  inputs.file(project(":lucene:core").file('src/data/jflex/UnicodeEmojiProperties.jflex'))

  doFirst {
    logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
  }

  doLast {
    // Rewrite the generated ZZ_BUFFERSIZE declaration from a static final constant to an instance field.
    def generatedSource = file('src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java')
    ant.replace(
        file: generatedSource,
        encoding: "UTF-8",
        token: "private static final int ZZ_BUFFERSIZE =",
        value: "private int ZZ_BUFFERSIZE ="
    )
  }
}
|
2020-01-30 07:45:15 -05:00
|
|
|
|
2021-05-02 13:17:18 -04:00
|
|
|
// Wrap the internal task via wrapWithPersistentChecksums, passing the spotless tasks as andThenTasks.
def generateUAX29URLEmailTokenizer = wrapWithPersistentChecksums(
    generateUAX29URLEmailTokenizerInternal,
    [andThenTasks: ["spotlessJava", "spotlessJavaApply"]])

configure(generateUAX29URLEmailTokenizer) {
  // UAX29URLEmailTokenizerImpl.jflex includes: UnicodeEmojiProperties.jflex and ASCIITLD.jflex
  // so we make sure both are up to date.
  dependsOn(":lucene:core:generateEmojiProperties", "generateTlds")
}
|
2020-01-30 07:45:15 -05:00
|
|
|
|
2021-05-02 13:17:18 -04:00
|
|
|
// Runs the htmlentity.py script with python3 to (re)write HTMLCharacterEntities.jflex,
// then normalizes the file's line endings to LF.
task generateHTMLCharacterEntitiesInternal() {
  // Every other generation task in this script declares these two; keep this one consistent
  // so it is described and grouped the same way.
  description "Regenerate HTMLCharacterEntities.jflex"
  group "generation"

  def target = file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
  def script = file("${resources}/htmlentity.py")

  outputs.files target
  inputs.file script

  doFirst {
    quietExec {
      executable = project.externalTool("python3")
      // The script runs from the directory that holds the target file.
      workingDir = target.parentFile
      args += [
        "-B", // don't write any bytecode cache
        script,
        target
      ]
    }

    project.ant.fixcrlf(
        file: target,
        encoding: "UTF-8",
        eol: "lf"
    )
  }
}
|
2020-12-28 06:26:13 -05:00
|
|
|
|
2021-05-02 13:17:18 -04:00
|
|
|
// Runs JFlex on HTMLStripCharFilter.jflex with the default skeleton.
task generateHTMLStripCharFilterInternal(type: JFlexTask) {
  group = "generation"
  description = "Regenerate HTMLStripCharFilter.java"

  skeleton = skeletonDefault
  jflexFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex')

  // Add included files as inputs.
  inputs.file(file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex'))
}
|
|
|
|
|
|
|
|
// Wrap the internal task via wrapWithPersistentChecksums, passing the spotless tasks as andThenTasks.
def generateHTMLStripCharFilter = wrapWithPersistentChecksums(
    generateHTMLStripCharFilterInternal,
    [andThenTasks: ["spotlessJava", "spotlessJavaApply"]])

configure(generateHTMLStripCharFilter) {
  // HTMLStripCharFilter.jflex includes HTMLCharacterEntities.jflex so we make sure it's up to date.
  dependsOn("generateHTMLCharacterEntities")
}
|
|
|
|
|
|
|
|
// Attach all generators of this project to "regenerate". Tasks that were not wrapped
// above are wrapped here; each call gets its own options map.
regenerate.dependsOn(
    wrapWithPersistentChecksums(generateWikipediaTokenizerInternal, [andThenTasks: ["spotlessJava", "spotlessJavaApply"]]),
    wrapWithPersistentChecksums(generateClassicTokenizerInternal, [andThenTasks: ["spotlessJava", "spotlessJavaApply"]]),
    generateUAX29URLEmailTokenizer,
    wrapWithPersistentChecksums(generateHTMLCharacterEntitiesInternal),
    generateHTMLStripCharFilter,
    wrapWithPersistentChecksums(generateTldsInternal, [andThenTasks: ["spotlessJava", "spotlessJavaApply"]])
)
|
2021-04-02 03:56:47 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Runs JFlex (main class jflex.Main, resolved from the root project's "jflex" configuration)
 * on a single grammar file in a forked JVM and writes the generated .java file next to the
 * grammar. Line endings of the generated file are normalized to LF afterwards.
 */
class JFlexTask extends DefaultTask {
  /** The JFlex grammar to compile. */
  @InputFile
  File jflexFile

  /** Skeleton file handed to JFlex via --skel. */
  @InputFile
  File skeleton

  /**
   * Maximum heap size for the forked JVM (for example "12g"); may be left unset.
   * Marked @Internal: it only tunes the forked process and is not an input of the generated source.
   */
  @Internal
  String heapSize

  /**
   * @return the generated Java source: same directory and base name as the grammar,
   *         with the trailing ".jflex" extension replaced by ".java".
   */
  @OutputFile
  File getGeneratedFile() {
    // Only swap the trailing extension; a ".jflex" occurring elsewhere in the path must stay untouched.
    def javaName = jflexFile.name.replaceFirst('\\.jflex$', ".java")
    return new File(jflexFile.absoluteFile.parentFile, javaName)
  }

  JFlexTask() {
    // Depending on the configuration makes sure the JFlex tool is resolved before the task runs.
    dependsOn(project.rootProject.configurations.jflex)
  }

  /**
   * Runs JFlex on the grammar and fixes up line endings of the result.
   *
   * @throws GradleException if the grammar file is unset or does not exist
   */
  @TaskAction
  def generate() {
    if (!jflexFile || !jflexFile.exists()) {
      throw new GradleException("JFlex file does not exist: ${jflexFile}")
    }

    // Single source of truth for the output location (same value Gradle tracks as @OutputFile).
    def target = getGeneratedFile()

    logger.lifecycle("Recompiling JFlex: ${project.rootDir.relativePath(jflexFile)}")

    project.javaexec {
      classpath {
        project.rootProject.configurations.jflex
      }

      main = "jflex.Main"
      args += [
        "-nobak",
        "--quiet",
        "--encoding", "UTF-8",
      ]

      if (heapSize) {
        maxHeapSize = heapSize
      }

      if (skeleton) {
        args += ["--skel", skeleton.absolutePath]
      }

      // Output directory first, grammar file last.
      args += [
        "-d", target.parentFile.absolutePath,
        jflexFile
      ]
    }

    // Correct line endings for Windows.
    project.ant.fixcrlf(
        file: target,
        encoding: "UTF-8",
        eol: "lf"
    )
  }
}
|