mirror of https://github.com/apache/lucene.git
LUCENE-9909: add checksums of included files for some jflex generation tasks. Fix a task ordering issue with spotless. (#121)
* LUCENE-9909: Some jflex regeneration tasks should have proper dependencies and also check the checksums of included files.
* Force a dependency on low-level spotless tasks so that they're always properly ordered (hell!). Update ASCIITLD and regenerate the remaining code. Add cross-dependencies between generation tasks that take includes as input.
This commit is contained in:
parent
06907a2c12
commit
8eb4eb2611
|
@ -269,7 +269,7 @@ configure(project(":lucene:analysis:common")) {
|
|||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodePropsInternal, [ andThenTasks: "spotlessApply" ])
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodePropsInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -252,9 +252,9 @@ configure(project(":lucene:queryparser")) {
|
|||
description "Regenerate query parsers (javacc syntax definitions)."
|
||||
group "generation"
|
||||
|
||||
dependsOn wrapWithPersistentChecksums(javaccParserClassicInternal, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(javaccParserSurroundInternal, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(javaccParserFlexibleInternal, [ andThenTasks: "spotlessApply" ])
|
||||
dependsOn wrapWithPersistentChecksums(javaccParserClassicInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
|
||||
wrapWithPersistentChecksums(javaccParserSurroundInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
|
||||
wrapWithPersistentChecksums(javaccParserFlexibleInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
}
|
||||
|
||||
regenerate.dependsOn javacc
|
||||
|
|
|
@ -41,6 +41,9 @@ configure(project(":lucene:core")) {
|
|||
jflexFile = file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex')
|
||||
skeleton = skeletonNoBufferExpansion
|
||||
|
||||
// Add included files as inputs.
|
||||
inputs.file project(":lucene:core").file('src/data/jflex/UnicodeEmojiProperties.jflex')
|
||||
|
||||
doLast {
|
||||
ant.replace(
|
||||
file: file('src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java'),
|
||||
|
@ -51,7 +54,13 @@ configure(project(":lucene:core")) {
|
|||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateStandardTokenizerInternal, [ andThenTasks: "spotlessApply" ])
|
||||
def generateStandardTokenizer = wrapWithPersistentChecksums(generateStandardTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
configure(generateStandardTokenizer) {
|
||||
// StandardTokenizerImpl.jflex includes UnicodeEmojiProperties.jflex so we make sure it's up to date.
|
||||
dependsOn ":lucene:core:generateEmojiProperties"
|
||||
}
|
||||
|
||||
regenerate.dependsOn generateStandardTokenizer
|
||||
}
|
||||
|
||||
configure(project(":lucene:analysis:common")) {
|
||||
|
@ -130,12 +139,9 @@ configure(project(":lucene:analysis:common")) {
|
|||
skeleton = skeletonNoBufferExpansion
|
||||
heapSize = "12g"
|
||||
|
||||
// Don't enforce strict dependency (although it would make sense to regenerate both).
|
||||
// Just ensure we run after TLD regeneration, if it runs too.
|
||||
mustRunAfter generateTldsInternal
|
||||
|
||||
// Additional input (included by the source jflex file)
|
||||
// Add included files as inputs.
|
||||
inputs.file file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex')
|
||||
inputs.file project(":lucene:core").file('src/data/jflex/UnicodeEmojiProperties.jflex')
|
||||
|
||||
doFirst {
|
||||
logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
|
||||
|
@ -151,22 +157,28 @@ configure(project(":lucene:analysis:common")) {
|
|||
}
|
||||
}
|
||||
|
||||
task generateHTMLStripCharFilterInternal(type: JFlexTask) {
|
||||
description "Regenerate HTMLStripCharFilter.java"
|
||||
group "generation"
|
||||
def generateUAX29URLEmailTokenizer = wrapWithPersistentChecksums(generateUAX29URLEmailTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
configure (generateUAX29URLEmailTokenizer) {
|
||||
// UAX29URLEmailTokenizerImpl.jflex includes: UnicodeEmojiProperties.jflex and ASCIITLD.jflex
|
||||
// so we make sure both are up to date.
|
||||
dependsOn ":lucene:core:generateEmojiProperties", "generateTlds"
|
||||
}
|
||||
|
||||
jflexFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex')
|
||||
skeleton = skeletonDefault
|
||||
task generateHTMLCharacterEntitiesInternal() {
|
||||
def target = file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
|
||||
def script = file("${resources}/htmlentity.py")
|
||||
|
||||
outputs.files target
|
||||
inputs.file script
|
||||
|
||||
doFirst {
|
||||
// Regenerate HTMLCharacterEntities.jflex first.
|
||||
def target = file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
|
||||
quietExec {
|
||||
executable = project.externalTool("python3")
|
||||
workingDir = target.parentFile
|
||||
args += [
|
||||
"-B", // don't write any bytecode cache
|
||||
"htmlentity.py"
|
||||
script,
|
||||
target
|
||||
]
|
||||
}
|
||||
|
||||
|
@ -178,11 +190,29 @@ configure(project(":lucene:analysis:common")) {
|
|||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizerInternal, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateClassicTokenizerInternal, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateUAX29URLEmailTokenizerInternal, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateHTMLStripCharFilterInternal, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateTldsInternal, [ andThenTasks: "spotlessApply" ])
|
||||
task generateHTMLStripCharFilterInternal(type: JFlexTask) {
|
||||
description "Regenerate HTMLStripCharFilter.java"
|
||||
group "generation"
|
||||
|
||||
// Add included files as inputs.
|
||||
inputs.file file('src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex')
|
||||
|
||||
jflexFile = file('src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex')
|
||||
skeleton = skeletonDefault
|
||||
}
|
||||
|
||||
def generateHTMLStripCharFilter = wrapWithPersistentChecksums(generateHTMLStripCharFilterInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
configure(generateHTMLStripCharFilter) {
|
||||
// HTMLStripCharFilter.jflex includes HTMLCharacterEntities.jflex so we make sure it's up to date.
|
||||
dependsOn "generateHTMLCharacterEntities"
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
|
||||
wrapWithPersistentChecksums(generateClassicTokenizerInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ]),
|
||||
generateUAX29URLEmailTokenizer,
|
||||
wrapWithPersistentChecksums(generateHTMLCharacterEntitiesInternal),
|
||||
generateHTMLStripCharFilter,
|
||||
wrapWithPersistentChecksums(generateTldsInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
}
|
||||
|
||||
class JFlexTask extends DefaultTask {
|
||||
|
|
|
@ -14,75 +14,79 @@
|
|||
# limitations under the License.
|
||||
|
||||
import re
|
||||
import sys
|
||||
|
||||
# A simple python script to generate an HTML entity map and a regex alternation
|
||||
# for inclusion in HTMLStripCharFilter.jflex.
|
||||
|
||||
def main():
|
||||
print(get_apache_license())
|
||||
codes = {}
|
||||
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
|
||||
for line in get_entity_text().split('\n'):
|
||||
match = regex.match(line)
|
||||
if match:
|
||||
key = match.group(1)
|
||||
if key == 'quot': codes[key] = r'\"'
|
||||
elif key == 'nbsp': codes[key] = ' ';
|
||||
else : codes[key] = r'\u%04X' % int(match.group(2))
|
||||
|
||||
keys = sorted(codes)
|
||||
|
||||
first_entry = True
|
||||
output_line = 'CharacterEntities = ( '
|
||||
for key in keys:
|
||||
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
|
||||
first_entry = False
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
if key in ('quot','copy','gt','lt','reg','amp'):
|
||||
new_entry = ' | "%s"' % key.upper()
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print(output_line, ')')
|
||||
|
||||
print('%{')
|
||||
print(' private static final Map<String,String> upperCaseVariantsAccepted')
|
||||
print(' = new HashMap<>();')
|
||||
print(' static {')
|
||||
print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
|
||||
print(' upperCaseVariantsAccepted.put("copy", "COPY");')
|
||||
print(' upperCaseVariantsAccepted.put("gt", "GT");')
|
||||
print(' upperCaseVariantsAccepted.put("lt", "LT");')
|
||||
print(' upperCaseVariantsAccepted.put("reg", "REG");')
|
||||
print(' upperCaseVariantsAccepted.put("amp", "AMP");')
|
||||
print(' }')
|
||||
print(' private static final CharArrayMap<Character> entityValues')
|
||||
print(' = new CharArrayMap<>(%i, false);' % len(keys))
|
||||
print(' static {')
|
||||
print(' String[] entities = {')
|
||||
output_line = ' '
|
||||
for key in keys:
|
||||
new_entry = ' "%s", "%s",' % (key, codes[key])
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print(output_line)
|
||||
with open(sys.argv[1], 'w') as f:
|
||||
sys.stdout = f
|
||||
|
||||
print(get_apache_license())
|
||||
codes = {}
|
||||
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
|
||||
for line in get_entity_text().split('\n'):
|
||||
match = regex.match(line)
|
||||
if match:
|
||||
key = match.group(1)
|
||||
if key == 'quot': codes[key] = r'\"'
|
||||
elif key == 'nbsp': codes[key] = ' ';
|
||||
else : codes[key] = r'\u%04X' % int(match.group(2))
|
||||
|
||||
keys = sorted(codes)
|
||||
|
||||
first_entry = True
|
||||
output_line = 'CharacterEntities = ( '
|
||||
for key in keys:
|
||||
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
|
||||
first_entry = False
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
if key in ('quot','copy','gt','lt','reg','amp'):
|
||||
new_entry = ' | "%s"' % key.upper()
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print(output_line, ')')
|
||||
|
||||
print('%{')
|
||||
print(' private static final Map<String,String> upperCaseVariantsAccepted')
|
||||
print(' = new HashMap<>();')
|
||||
print(' static {')
|
||||
print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
|
||||
print(' upperCaseVariantsAccepted.put("copy", "COPY");')
|
||||
print(' upperCaseVariantsAccepted.put("gt", "GT");')
|
||||
print(' upperCaseVariantsAccepted.put("lt", "LT");')
|
||||
print(' upperCaseVariantsAccepted.put("reg", "REG");')
|
||||
print(' upperCaseVariantsAccepted.put("amp", "AMP");')
|
||||
print(' }')
|
||||
print(' private static final CharArrayMap<Character> entityValues')
|
||||
print(' = new CharArrayMap<>(%i, false);' % len(keys))
|
||||
print(' static {')
|
||||
print(' String[] entities = {')
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print(output_line[:-1])
|
||||
print(' };')
|
||||
print(' for (int i = 0 ; i < entities.length ; i += 2) {')
|
||||
print(' Character value = entities[i + 1].charAt(0);')
|
||||
print(' entityValues.put(entities[i], value);')
|
||||
print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
|
||||
print(' if (upperCaseVariant != null) {')
|
||||
print(' entityValues.put(upperCaseVariant, value);')
|
||||
print(' }')
|
||||
print(' }')
|
||||
print(" }")
|
||||
print("%}")
|
||||
for key in keys:
|
||||
new_entry = ' "%s", "%s",' % (key, codes[key])
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print(output_line[:-1])
|
||||
print(' };')
|
||||
print(' for (int i = 0 ; i < entities.length ; i += 2) {')
|
||||
print(' Character value = entities[i + 1].charAt(0);')
|
||||
print(' entityValues.put(entities[i], value);')
|
||||
print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
|
||||
print(' if (upperCaseVariant != null) {')
|
||||
print(' entityValues.put(upperCaseVariant, value);')
|
||||
print(' }')
|
||||
print(' }')
|
||||
print(" }")
|
||||
print("%}")
|
||||
|
||||
def get_entity_text():
|
||||
# The text below is taken verbatim from
|
|
@ -99,8 +99,8 @@ configure(project(":lucene:core")) {
|
|||
description "Regenerate Moman-based sources."
|
||||
group "generation"
|
||||
|
||||
dependsOn wrapWithPersistentChecksums(utilGenPackedInternal, [ andThenTasks: "spotlessApply" ])
|
||||
dependsOn wrapWithPersistentChecksums(utilGenLevInternal, [ andThenTasks: "spotlessApply" ])
|
||||
dependsOn wrapWithPersistentChecksums(utilGenPackedInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
dependsOn wrapWithPersistentChecksums(utilGenLevInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
|
||||
}
|
||||
|
||||
regenerate.dependsOn moman
|
||||
|
|
|
@ -133,5 +133,5 @@ configure(project(":lucene:analysis:common")) {
|
|||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(snowballInternal, [ andThenTasks: "spotlessApply", ignoreWithSource: [downloadSnowballStemmers, downloadSnowballWebsite, downloadSnowballData] ])
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(snowballInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"], ignoreWithSource: [downloadSnowballStemmers, downloadSnowballWebsite, downloadSnowballData] ])
|
||||
}
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"gradle/generation/jflex/htmlentity.py": "66e0238fed4e069addbbdb93179b1457fc3a5203",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d"
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.default.txt": "ca1043249c0eefdf2623a785e2b91f5608bfc3f1",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex": "d1aa75b9b37646efe31731394f84a063eb7eed9d",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java": "78f5208455706d60a9ce4b63624ed04b0fd32573",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex": "71760e2f7abe078109545a0c68aeac9125508d7c"
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "aae1ea12f09aa2efcf7611df2dd11cda32869cda",
|
||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "54d9a32e6dbac42aee8b3aa0d1133ed2fb5f5259",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "bb3878ea10f85f124a0a9e4ea614d3400d664dae",
|
||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "800f0f70d7ce4723d493cd5da88a19bfc8e532e5",
|
||||
"property:tldZones": "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "aae1ea12f09aa2efcf7611df2dd11cda32869cda",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "d890462c065c2b66ce0e58d95fae64ecb64049d2",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2"
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "bb3878ea10f85f124a0a9e4ea614d3400d664dae",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "2bf3efe1a1bc473eb3fe2456f50521ecd7d9b03b",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2",
|
||||
"lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex": "7491dd535debc6e9e9ce367c4d3a7217e466dcae"
|
||||
}
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
|
||||
// file version from 2021 Apr 18, Sun 07:07:01 Coordinated Universal Time
|
||||
// file version from 2021 May 1, Sat 07:07:01 Coordinated Universal Time
|
||||
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
||||
|
||||
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
|
||||
|
@ -589,7 +589,6 @@ ASCIITLD = "." (
|
|||
| [iI][sS][tT][aA][nN][bB][uU][lL]
|
||||
| [iI][tT][aA][uU]
|
||||
| [iI][tT][vV]
|
||||
| [iI][vV][eE][cC][oO]
|
||||
| [jJ][aA][gG][uU][aA][rR]
|
||||
| [jJ][aA][vV][aA]
|
||||
| [jJ][cC][bB]
|
||||
|
@ -1039,7 +1038,6 @@ ASCIITLD = "." (
|
|||
| [sS][pP][aA][cC][eE]
|
||||
| [sS][pP][oO][rR][tT]
|
||||
| [sS][pP][oO][tT]
|
||||
| [sS][pP][rR][eE][aA][dD][bB][eE][tT][tT][iI][nN][gG]
|
||||
| [sS][rR][lL]
|
||||
| [sS][sS]
|
||||
| [sS][tT][aA][dD][aA]
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -624,7 +624,6 @@ istanbul
|
|||
it
|
||||
itau
|
||||
itv
|
||||
iveco
|
||||
jaguar
|
||||
java
|
||||
jcb
|
||||
|
@ -1122,7 +1121,6 @@ spa
|
|||
space
|
||||
sport
|
||||
spot
|
||||
spreadbetting
|
||||
sr
|
||||
srl
|
||||
ss
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
|
||||
"lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex": "7491dd535debc6e9e9ce367c4d3a7217e466dcae",
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "8e33c2698446c1c7a9479796a41316d1932ceda9",
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "6158aeb8dd11cd9100623608b2dcce51b2df9d0b"
|
||||
}
|
Loading…
Reference in New Issue