lucene/gradle/generation/jflex.gradle
Robert Muir b0bd64c620
LUCENE-9924: generate TLD list from IANA TLD db, rather than root zone db (#77)
This adds a bit of simplicity as the file is a simple domain list,
rather than a DNS zone. So the regexes parsing DNS can be removed.

Also the file may change less often as it contains JUST the list of
TLDs, and not any additional DNS metadata.
2021-04-11 11:25:15 -04:00

227 lines
6.8 KiB
Groovy

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Declare a "jflex" dependency configuration on the root project and put the JFlex
// tool into it. JFlexTask (defined at the bottom of this script) uses this
// configuration as the classpath when it launches jflex.Main.
configure(rootProject) {
configurations {
jflex
}
dependencies {
// The JFlex version is looked up in scriptDepVersions under the 'jflex' key.
jflex "de.jflex:jflex:${scriptDepVersions['jflex']}"
}
}
// Location of this script's resources, as reported by scriptResources(buildscript).
def resourceDir = scriptResources(buildscript)

// The two JFlex skeleton files the generation tasks below choose between.
def skeletonDefault = file("${resourceDir}/skeleton.default.txt")
def skeletonNoBufferExpansion = file("${resourceDir}/skeleton.disable.buffer.expansion.txt")
configure(project(":lucene:core")) {
  // Regenerates StandardTokenizerImpl.java from its JFlex grammar using the
  // "disable buffer expansion" skeleton.
  task generateStandardTokenizer(type: JFlexTask) {
    group = "generation"
    description = "Regenerate StandardTokenizerImpl.java"

    skeleton = skeletonNoBufferExpansion
    jflexFile = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex')

    doLast {
      // Post-process the generated source: turn the ZZ_BUFFERSIZE declaration
      // from a static final constant into a plain instance field.
      def generatedSource = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java')
      ant.replace(
          file: generatedSource,
          encoding: "UTF-8",
          token: "private static final int ZZ_BUFFERSIZE =",
          value: "private int ZZ_BUFFERSIZE ="
      )
    }
  }

  // Hook the checksum-wrapped task into the aggregate "regenerate" task;
  // spotlessApply is requested as the follow-up task.
  regenerate.dependsOn wrapWithPersistentChecksums(generateStandardTokenizer, [ andThenTasks: "spotlessApply" ])
}
configure(project(":lucene:analysis:common")) {
// Runs GenerateJflexTLDMacros (from the "tools" source set) against the IANA TLD
// list to rewrite the ASCIITLD.jflex macro file and the TLDs.txt test list.
task generateTlds() {
  group = "generation"
  description = "Regenerate top-level domain jflex macros and tests"

  def tldSourceUrl = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
  def macroOutput = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
  def testListOutput = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")

  // The generator is launched from the tools runtime classpath.
  dependsOn { sourceSets.tools.runtimeClasspath }
  outputs.files macroOutput, testListOutput

  doFirst {
    project.javaexec {
      classpath = sourceSets.tools.runtimeClasspath
      main = "org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
      // A non-zero exit code from the generator fails the task.
      ignoreExitValue false
      // Arguments: source URL, jflex macro output, test list output.
      args = [tldSourceUrl, macroOutput, testListOutput]
    }

    logger.lifecycle("You've regenerated the TLD include file, remember to regenerate UAX29URLEmailTokenizerImpl too.")
  }
}
// Regenerates WikipediaTokenizerImpl.java from its JFlex grammar with the default skeleton.
task generateWikipediaTokenizer(type: JFlexTask) {
  group = "generation"
  description = "Regenerate WikipediaTokenizerImpl.java"

  skeleton = skeletonDefault
  jflexFile = file('src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex')
}
// Regenerates ClassicTokenizerImpl.java from its JFlex grammar with the default skeleton.
task generateClassicTokenizer(type: JFlexTask) {
  group = "generation"
  description = "Regenerate ClassicTokenizerImpl.java"

  skeleton = skeletonDefault
  jflexFile = file('src/java/org/apache/lucene/analysis/classic/ClassicTokenizerImpl.jflex')
}
// Regenerates UAX29URLEmailTokenizerImpl.java from its JFlex grammar using the
// "disable buffer expansion" skeleton and a 12g heap for the JFlex JVM.
task generateUAX29URLEmailTokenizer(type: JFlexTask) {
  group = "generation"
  description = "Regenerate UAX29URLEmailTokenizerImpl.java"

  skeleton = skeletonNoBufferExpansion
  jflexFile = file('src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex')
  heapSize = "12g"

  // The grammar includes the TLD macro file, so it is declared as an extra input.
  inputs.file file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex')

  // Ordering only, not a hard dependency: if TLD regeneration is also scheduled,
  // it has to finish before this task starts (regenerating both would make sense).
  mustRunAfter generateTlds

  doFirst {
    logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
  }

  doLast {
    // Post-process the generated source: turn the ZZ_BUFFERSIZE declaration
    // from a static final constant into a plain instance field.
    def generatedSource = file('src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java')
    ant.replace(
        file: generatedSource,
        encoding: "UTF-8",
        token: "private static final int ZZ_BUFFERSIZE =",
        value: "private int ZZ_BUFFERSIZE ="
    )
  }
}
// Regenerates HTMLStripCharFilter.java from its JFlex grammar with the default
// skeleton. Before JFlex runs, HTMLCharacterEntities.jflex is rebuilt by a
// Python script located next to it.
task generateHTMLStripCharFilter(type: JFlexTask) {
  group = "generation"
  description = "Regenerate HTMLStripCharFilter.java"

  skeleton = skeletonDefault
  jflexFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex')

  doFirst {
    def entitiesFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')

    // Step 1: run htmlentity.py from the directory that holds the entities file.
    quietExec {
      executable = project.externalTool("python3")
      workingDir = entitiesFile.parentFile
      args += [
        "-B", // don't write any bytecode cache
        "htmlentity.py"
      ]
    }

    // Step 2: normalize the entities file to LF line endings.
    project.ant.fixcrlf(
        file: entitiesFile,
        encoding: "UTF-8",
        eol: "lf"
    )
  }
}
// Attach every generation task of this project to the aggregate "regenerate" task.
// Each one is wrapped with persistent checksums and requests spotlessApply as
// its follow-up task; wrapping order matches the list order.
regenerate.dependsOn([
    generateWikipediaTokenizer,
    generateClassicTokenizer,
    generateUAX29URLEmailTokenizer,
    generateHTMLStripCharFilter,
    generateTlds
].collect { generationTask ->
  wrapWithPersistentChecksums(generationTask, [ andThenTasks: "spotlessApply" ])
})
}
/**
 * Compiles a single JFlex grammar into a Java source file placed next to the grammar.
 *
 * JFlex is launched in a separate JVM whose classpath is the root project's
 * "jflex" configuration. After generation the output is normalized to LF line endings.
 */
class JFlexTask extends DefaultTask {
  /** The .jflex grammar to compile. */
  @InputFile
  File jflexFile

  /** Skeleton file handed to JFlex through the --skel option. */
  @InputFile
  File skeleton

  /**
   * Optional maximum heap size (for example "12g") for the JVM that runs JFlex.
   *
   * Marked @Internal: it only affects how the generator is executed, so it is
   * deliberately kept out of the task's inputs. (@Optional on its own is a
   * modifier, not a property-type annotation, and leaves the property unannotated.)
   */
  @Internal
  String heapSize

  /**
   * The Java file produced for {@link #jflexFile}: same path with the trailing
   * ".jflex" extension swapped for ".java". Only the extension is rewritten, so a
   * ".jflex" fragment elsewhere in the absolute path is left alone.
   */
  @OutputFile
  File getGeneratedFile() {
    return project.file(jflexFile.absolutePath.replaceFirst(/\.jflex$/, ".java"))
  }

  JFlexTask() {
    // Make sure the JFlex tool configuration is available before the task runs.
    dependsOn(project.rootProject.configurations.jflex)
  }

  /**
   * Runs jflex.Main on the grammar and fixes up line endings of the result.
   *
   * @throws GradleException if the grammar file is unset or does not exist
   */
  @TaskAction
  def generate() {
    if (!jflexFile || !jflexFile.exists()) {
      throw new GradleException("JFlex file does not exist: ${jflexFile}")
    }

    // Single source of truth for the output location (shared with the @OutputFile getter).
    def target = getGeneratedFile()

    logger.lifecycle("Recompiling JFlex: ${project.rootDir.relativePath(jflexFile)}")

    project.javaexec {
      classpath {
        project.rootProject.configurations.jflex
      }

      main = "jflex.Main"

      args += [
        "-nobak",
        "--quiet",
        "--encoding", "UTF-8",
      ]

      // Only override the forked JVM's heap when the task asked for it.
      if (heapSize) {
        maxHeapSize = heapSize
      }

      if (skeleton) {
        args += ["--skel", skeleton.absolutePath]
      }

      // Write the generated class into the grammar's own directory.
      args += [
        "-d", target.parentFile.absolutePath,
        jflexFile
      ]
    }

    // Correct line endings for Windows.
    project.ant.fixcrlf(
        file: target,
        encoding: "UTF-8",
        eol: "lf"
    )
  }
}