LUCENE-9909: add checksums of included files for some jflex generation tasks. Fix a task ordering issue with spotless. (#121)

* LUCENE-9909: Some jflex regeneration tasks should have proper dependencies and also check the checksums of included files.

* Force a dependency on the low-level spotless tasks (spotlessJava, spotlessJavaApply) so that they're always properly ordered. Update ASCIITLD and regenerate the remaining code. Add cross-dependencies between generation tasks that take included files as inputs.
This commit is contained in:
Dawid Weiss 2021-05-02 19:17:18 +02:00 committed by GitHub
parent 06907a2c12
commit 8eb4eb2611
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 34975 additions and 35079 deletions

View File

@ -269,7 +269,7 @@ configure(project(":lucene:analysis:common")) {
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodePropsInternal, [ andThenTasks: "spotlessApply" ])
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodePropsInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
}

View File

@ -252,9 +252,9 @@ configure(project(":lucene:queryparser")) {
description "Regenerate query parsers (javacc syntax definitions)."
group "generation"
dependsOn wrapWithPersistentChecksums(javaccParserClassicInternal, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(javaccParserSurroundInternal, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(javaccParserFlexibleInternal, [ andThenTasks: "spotlessApply" ])
dependsOn wrapWithPersistentChecksums(javaccParserClassicInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
wrapWithPersistentChecksums(javaccParserSurroundInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
wrapWithPersistentChecksums(javaccParserFlexibleInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
}
regenerate.dependsOn javacc

View File

@ -41,6 +41,9 @@ configure(project(":lucene:core")) {
jflexFile = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex')
skeleton = skeletonNoBufferExpansion
// Add included files as inputs.
inputs.file project(":lucene:core").file('src/data/jflex/UnicodeEmojiProperties.jflex')
doLast {
ant.replace(
file: file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java'),
@ -51,7 +54,13 @@ configure(project(":lucene:core")) {
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateStandardTokenizerInternal, [ andThenTasks: "spotlessApply" ])
def generateStandardTokenizer = wrapWithPersistentChecksums(generateStandardTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
configure(generateStandardTokenizer) {
// StandardTokenizerImpl.jflex includes UnicodeEmojiProperties.jflex so we make sure it's up to date.
dependsOn ":lucene:core:generateEmojiProperties"
}
regenerate.dependsOn generateStandardTokenizer
}
configure(project(":lucene:analysis:common")) {
@ -130,12 +139,9 @@ configure(project(":lucene:analysis:common")) {
skeleton = skeletonNoBufferExpansion
heapSize = "12g"
// Don't enforce strict dependency (although it would make sense to regenerate both).
// Just ensure we run after TLD regeneration, if it runs too.
mustRunAfter generateTldsInternal
// Additional input (included by the source jflex file)
// Add included files as inputs.
inputs.file file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex')
inputs.file project(":lucene:core").file('src/data/jflex/UnicodeEmojiProperties.jflex')
doFirst {
logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
@ -151,22 +157,28 @@ configure(project(":lucene:analysis:common")) {
}
}
task generateHTMLStripCharFilterInternal(type: JFlexTask) {
description "Regenerate HTMLStripCharFilter.java"
group "generation"
def generateUAX29URLEmailTokenizer = wrapWithPersistentChecksums(generateUAX29URLEmailTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
configure (generateUAX29URLEmailTokenizer) {
// UAX29URLEmailTokenizerImpl.jflex includes: UnicodeEmojiProperties.jflex and ASCIITLD.jflex
// so we make sure both are up to date.
dependsOn ":lucene:core:generateEmojiProperties", "generateTlds"
}
jflexFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex')
skeleton = skeletonDefault
task generateHTMLCharacterEntitiesInternal() {
def target = file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
def script = file("${resources}/htmlentity.py")
outputs.files target
inputs.file script
doFirst {
// Regenerate HTMLCharacterEntities.jflex first.
def target = file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
quietExec {
executable = project.externalTool("python3")
workingDir = target.parentFile
args += [
"-B", // don't write any bytecode cache
"htmlentity.py"
script,
target
]
}
@ -178,11 +190,29 @@ configure(project(":lucene:analysis:common")) {
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizerInternal, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateClassicTokenizerInternal, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateUAX29URLEmailTokenizerInternal, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateHTMLStripCharFilterInternal, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateTldsInternal, [ andThenTasks: "spotlessApply" ])
task generateHTMLStripCharFilterInternal(type: JFlexTask) {
description "Regenerate HTMLStripCharFilter.java"
group "generation"
// Add included files as inputs.
inputs.file file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
jflexFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex')
skeleton = skeletonDefault
}
def generateHTMLStripCharFilter = wrapWithPersistentChecksums(generateHTMLStripCharFilterInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
configure(generateHTMLStripCharFilter) {
// HTMLStripCharFilter.jflex includes HTMLCharacterEntities.jflex so we make sure it's up to date.
dependsOn "generateHTMLCharacterEntities"
}
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
wrapWithPersistentChecksums(generateClassicTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
generateUAX29URLEmailTokenizer,
wrapWithPersistentChecksums(generateHTMLCharacterEntitiesInternal),
generateHTMLStripCharFilter,
wrapWithPersistentChecksums(generateTldsInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
}
class JFlexTask extends DefaultTask {

View File

@ -14,75 +14,79 @@
# limitations under the License.
import re
import sys
# A simple python script to generate an HTML entity map and a regex alternation
# for inclusion in HTMLStripCharFilter.jflex.
def main():
print(get_apache_license())
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
match = regex.match(line)
if match:
key = match.group(1)
if key == 'quot': codes[key] = r'\"'
elif key == 'nbsp': codes[key] = ' ';
else : codes[key] = r'\u%04X' % int(match.group(2))
keys = sorted(codes)
first_entry = True
output_line = 'CharacterEntities = ( '
for key in keys:
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
print(output_line, ')')
print('%{')
print(' private static final Map<String,String> upperCaseVariantsAccepted')
print(' = new HashMap<>();')
print(' static {')
print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
print(' upperCaseVariantsAccepted.put("copy", "COPY");')
print(' upperCaseVariantsAccepted.put("gt", "GT");')
print(' upperCaseVariantsAccepted.put("lt", "LT");')
print(' upperCaseVariantsAccepted.put("reg", "REG");')
print(' upperCaseVariantsAccepted.put("amp", "AMP");')
print(' }')
print(' private static final CharArrayMap<Character> entityValues')
print(' = new CharArrayMap<>(%i, false);' % len(keys))
print(' static {')
print(' String[] entities = {')
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
print(output_line)
with open(sys.argv[1], 'w') as f:
sys.stdout = f
print(get_apache_license())
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
match = regex.match(line)
if match:
key = match.group(1)
if key == 'quot': codes[key] = r'\"'
elif key == 'nbsp': codes[key] = ' ';
else : codes[key] = r'\u%04X' % int(match.group(2))
keys = sorted(codes)
first_entry = True
output_line = 'CharacterEntities = ( '
for key in keys:
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
print(output_line, ')')
print('%{')
print(' private static final Map<String,String> upperCaseVariantsAccepted')
print(' = new HashMap<>();')
print(' static {')
print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
print(' upperCaseVariantsAccepted.put("copy", "COPY");')
print(' upperCaseVariantsAccepted.put("gt", "GT");')
print(' upperCaseVariantsAccepted.put("lt", "LT");')
print(' upperCaseVariantsAccepted.put("reg", "REG");')
print(' upperCaseVariantsAccepted.put("amp", "AMP");')
print(' }')
print(' private static final CharArrayMap<Character> entityValues')
print(' = new CharArrayMap<>(%i, false);' % len(keys))
print(' static {')
print(' String[] entities = {')
output_line = ' '
output_line += new_entry
print(output_line[:-1])
print(' };')
print(' for (int i = 0 ; i < entities.length ; i += 2) {')
print(' Character value = entities[i + 1].charAt(0);')
print(' entityValues.put(entities[i], value);')
print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
print(' if (upperCaseVariant != null) {')
print(' entityValues.put(upperCaseVariant, value);')
print(' }')
print(' }')
print(" }")
print("%}")
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
print(output_line)
output_line = ' '
output_line += new_entry
print(output_line[:-1])
print(' };')
print(' for (int i = 0 ; i < entities.length ; i += 2) {')
print(' Character value = entities[i + 1].charAt(0);')
print(' entityValues.put(entities[i], value);')
print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
print(' if (upperCaseVariant != null) {')
print(' entityValues.put(upperCaseVariant, value);')
print(' }')
print(' }')
print(" }")
print("%}")
def get_entity_text():
# The text below is taken verbatim from

View File

@ -99,8 +99,8 @@ configure(project(":lucene:core")) {
description "Regenerate Moman-based sources."
group "generation"
dependsOn wrapWithPersistentChecksums(utilGenPackedInternal, [ andThenTasks: "spotlessApply" ])
dependsOn wrapWithPersistentChecksums(utilGenLevInternal, [ andThenTasks: "spotlessApply" ])
dependsOn wrapWithPersistentChecksums(utilGenPackedInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
dependsOn wrapWithPersistentChecksums(utilGenLevInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
}
regenerate.dependsOn moman

View File

@ -133,5 +133,5 @@ configure(project(":lucene:analysis:common")) {
}
}
regenerate.dependsOn wrapWithPersistentChecksums(snowballInternal, [ andThenTasks: "spotlessApply", ignoreWithSource: [downloadSnowballStemmers, downloadSnowballWebsite, downloadSnowballData] ])
regenerate.dependsOn wrapWithPersistentChecksums(snowballInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"], ignoreWithSource: [downloadSnowballStemmers, downloadSnowballWebsite, downloadSnowballData] ])
}

View File

@ -0,0 +1,4 @@
{
"gradle/generation/jflex/htmlentity.py": "66e0238fed4e069addbbdb93179b1457fc3a5203",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d"
}

View File

@ -1,5 +1,6 @@
{
"gradle/generation/jflex/skeleton.default.txt": "ca1043249c0eefdf2623a785e2b91f5608bfc3f1",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "78f5208455706d60a9ce4b63624ed04b0fd32573",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "71760e2f7abe078109545a0c68aeac9125508d7c"
}

View File

@ -1,5 +1,5 @@
{
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "aae1ea12f09aa2efcf7611df2dd11cda32869cda",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "54d9a32e6dbac42aee8b3aa0d1133ed2fb5f5259",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "bb3878ea10f85f124a0a9e4ea614d3400d664dae",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "800f0f70d7ce4723d493cd5da88a19bfc8e532e5",
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
}

View File

@ -1,6 +1,7 @@
{
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "aae1ea12f09aa2efcf7611df2dd11cda32869cda",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "d890462c065c2b66ce0e58d95fae64ecb64049d2",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2"
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "bb3878ea10f85f124a0a9e4ea614d3400d664dae",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "2bf3efe1a1bc473eb3fe2456f50521ecd7d9b03b",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2",
"lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex": "7491dd535debc6e9e9ce367c4d3a7217e466dcae"
}

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
// file version from 2021 Apr 18, Sun 07:07:01 Coordinated Universal Time
// file version from 2021 May 1, Sat 07:07:01 Coordinated Universal Time
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
@ -589,7 +589,6 @@ ASCIITLD = "." (
| [iI][sS][tT][aA][nN][bB][uU][lL]
| [iI][tT][aA][uU]
| [iI][tT][vV]
| [iI][vV][eE][cC][oO]
| [jJ][aA][gG][uU][aA][rR]
| [jJ][aA][vV][aA]
| [jJ][cC][bB]
@ -1039,7 +1038,6 @@ ASCIITLD = "." (
| [sS][pP][aA][cC][eE]
| [sS][pP][oO][rR][tT]
| [sS][pP][oO][tT]
| [sS][pP][rR][eE][aA][dD][bB][eE][tT][tT][iI][nN][gG]
| [sS][rR][lL]
| [sS][sS]
| [sS][tT][aA][dD][aA]

View File

@ -624,7 +624,6 @@ istanbul
it
itau
itv
iveco
jaguar
java
jcb
@ -1122,7 +1121,6 @@ spa
space
sport
spot
spreadbetting
sr
srl
ss

View File

@ -1,5 +1,6 @@
{
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
"lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex": "7491dd535debc6e9e9ce367c4d3a7217e466dcae",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "8e33c2698446c1c7a9479796a41316d1932ceda9",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "6158aeb8dd11cd9100623608b2dcce51b2df9d0b"
}