LUCENE-9904: Port GenerateJflexTLDMacros.java regeneration to gradle and regenerate UAX tokenizer with up-to-date TLDs

This commit is contained in:
Dawid Weiss 2021-04-07 10:56:21 +02:00 committed by GitHub
parent efeea0b8ee
commit 39071dbc54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 40225 additions and 41424 deletions

View File

@ -53,6 +53,35 @@ configure(project(":lucene:core")) {
} }
configure(project(":lucene:analysis:common")) { configure(project(":lucene:analysis:common")) {
// Regenerates the ASCII TLD jflex macro file and the plain TLD list used by tests
// by running the GenerateJflexTLDMacros tool from the "tools" source set.
task generateTlds() {
// Root zone URL handed to the generator as its first argument.
def tldZones = "https://www.internic.net/zones/root.zone"
// Generated jflex macro file (second generator argument).
def jflexMacro = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
// Generated TLD list under the test sources (third generator argument).
def tldList = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")
description "Regenerate top-level domain jflex macros and tests"
group "generation"
// Closure form defers evaluation of the tools runtime classpath until task
// dependencies are resolved.
dependsOn { sourceSets.tools.runtimeClasspath }
// Both generated files are declared as outputs of this task.
outputs.files jflexMacro, tldList
doFirst {
project.javaexec {
main = "org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
classpath = sourceSets.tools.runtimeClasspath
// Do not ignore the generator's exit value.
ignoreExitValue false
// Argument order: zone file URL, jflex output file, TLD list file.
args = [
tldZones,
jflexMacro,
tldList
]
}
// Reminder only: this task does not regenerate the tokenizer itself.
logger.lifecycle("You've regenerated the TLD include file, remember to regenerate UAX29URLEmailTokenizerImpl too.")
}
}
task generateWikipediaTokenizer(type: JFlexTask) { task generateWikipediaTokenizer(type: JFlexTask) {
description "Regenerate WikipediaTokenizerImpl.java" description "Regenerate WikipediaTokenizerImpl.java"
group "generation" group "generation"
@ -77,6 +106,13 @@ configure(project(":lucene:analysis:common")) {
skeleton = skeletonNoBufferExpansion skeleton = skeletonNoBufferExpansion
heapSize = "12g" heapSize = "12g"
// Don't enforce strict dependency (although it would make sense to regenerate both).
// Just ensure we run after TLD regeneration, if it runs too.
mustRunAfter generateTlds
// Additional input (included by the source jflex file)
inputs.file file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex')
doFirst { doFirst {
logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).") logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
} }
@ -121,7 +157,8 @@ configure(project(":lucene:analysis:common")) {
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizer, [ andThenTasks: "spotlessApply" ]), regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizer, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateClassicTokenizer, [ andThenTasks: "spotlessApply" ]), wrapWithPersistentChecksums(generateClassicTokenizer, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateUAX29URLEmailTokenizer, [ andThenTasks: "spotlessApply" ]), wrapWithPersistentChecksums(generateUAX29URLEmailTokenizer, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateHTMLStripCharFilter, [ andThenTasks: "spotlessApply" ]) wrapWithPersistentChecksums(generateHTMLStripCharFilter, [ andThenTasks: "spotlessApply" ]),
wrapWithPersistentChecksums(generateTlds, [ andThenTasks: "spotlessApply" ])
} }
class JFlexTask extends DefaultTask { class JFlexTask extends DefaultTask {

View File

@ -234,6 +234,9 @@ Bug fixes
Changes in Backwards Compatibility Policy Changes in Backwards Compatibility Policy
* LUCENE-9904: Regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date
top-level domains. This may change the token sequence compared to previous Lucene versions. (Dawid Weiss)
* LUCENE-9669: DirectoryReader#open now accepts an argument to open indices created with versions * LUCENE-9669: DirectoryReader#open now accepts an argument to open indices created with versions
older than N-1. Lucene now can open indices created with a major version of N-2 in read-only mode. older than N-1. Lucene now can open indices created with a major version of N-2 in read-only mode.
Opening an index created with a major version of N-2 with an IndexWriter is not supported. Opening an index created with a major version of N-2 with an IndexWriter is not supported.

View File

@ -0,0 +1,4 @@
{
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "29b05df80da071a249ff68067d5833fd7e4a9ff4",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "b346a80f511b64a59d556eb3ef58cf396e98c631"
}

View File

@ -1,5 +1,6 @@
{ {
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "472a33bcf5741bc8923aaa2717000d9ccd62f1e2", "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2",
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908", "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ebeb76148ee02612841c463819f86d8932c9a7c3" "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "29b05df80da071a249ff68067d5833fd7e4a9ff4",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "e437900d9570ca007f9c02c9ea286222b644c329"
} }

File diff suppressed because it is too large Load Diff

View File

@ -132,7 +132,7 @@ ComplexContextEx = \p{LB:Complex_Context}
// RFC-5321: Simple Mail Transfer Protocol // RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format // RFC-5322: Internet Message Format
%include ASCIITLD.jflex-macro %include ASCIITLD.jflex
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])? DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainLabelSequence = {DomainLabel} ("." {DomainLabel})* DomainLabelSequence = {DomainLabel} ("." {DomainLabel})*

File diff suppressed because it is too large Load Diff

View File

@ -40,6 +40,7 @@ import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
/** /**
* Generates a file containing JFlex macros to accept valid ASCII TLDs (top level domains), for * Generates a file containing JFlex macros to accept valid ASCII TLDs (top level domains), for
@ -53,16 +54,18 @@ import java.util.regex.Pattern;
public class GenerateJflexTLDMacros { public class GenerateJflexTLDMacros {
public static void main(String... args) throws Exception { public static void main(String... args) throws Exception {
if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) { if (args.length != 3 || args[0].equals("--help") || args[0].equals("-help")) {
System.err.println("Cmd line params:"); System.err.println("Cmd line params:");
System.err.println( System.err.println(
"\tjava " + GenerateJflexTLDMacros.class.getName() + "<ZoneFileURL> <JFlexOutputFile>"); " java "
+ GenerateJflexTLDMacros.class.getName()
+ "<ZoneFileURL> <JFlexOutputFile> <TLDListFile>");
System.exit(1); System.exit(1);
} }
new GenerateJflexTLDMacros(args[0], args[1]).execute(); new GenerateJflexTLDMacros(args[0], args[1], args[2]).execute();
} }
private static final String NL = System.getProperty("line.separator"); private static final String NL = "\n";
private static final String APACHE_LICENSE = private static final String APACHE_LICENSE =
"/*" "/*"
@ -103,16 +106,21 @@ public class GenerateJflexTLDMacros {
Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*"); Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
private final URL tldFileURL; private final URL tldFileURL;
private long tldFileLastModified = -1L; private long tldFileLastModified = -1L;
private final Path outputFile; private final Path tldListFile;
private final Path jflexMacroFile;
private final SortedMap<String, Boolean> processedTLDsLongestFirst = private final SortedMap<String, Boolean> processedTLDsLongestFirst =
new TreeMap<>( new TreeMap<>(
Comparator.comparing(String::length).reversed().thenComparing(String::compareTo)); Comparator.comparing(String::length).reversed().thenComparing(String::compareTo));
private final List<SortedSet<String>> TLDsBySuffixLength =
new ArrayList<>(); // list position indicates suffix length
public GenerateJflexTLDMacros(String tldFileURL, String outputFile) throws Exception { // list position indicates suffix length
private final List<SortedSet<String>> TLDsBySuffixLength = new ArrayList<>();
public GenerateJflexTLDMacros(String tldFileURL, String jflexFile, String tldListFile)
throws Exception {
this.tldFileURL = new URL(tldFileURL); this.tldFileURL = new URL(tldFileURL);
this.outputFile = Paths.get(outputFile); this.jflexMacroFile = Paths.get(jflexFile);
this.tldListFile = Paths.get(tldListFile);
} }
/** /**
@ -126,7 +134,7 @@ public class GenerateJflexTLDMacros {
getIANARootZoneDatabase(); getIANARootZoneDatabase();
partitionTLDprefixesBySuffixLength(); partitionTLDprefixesBySuffixLength();
writeOutput(); writeOutput();
System.out.println("Wrote TLD macros to '" + outputFile + "':"); System.out.println("Wrote TLD macros to '" + jflexMacroFile + "':");
int totalDomains = 0; int totalDomains = 0;
for (int suffixLength = 0; suffixLength < TLDsBySuffixLength.size(); ++suffixLength) { for (int suffixLength = 0; suffixLength < TLDsBySuffixLength.size(); ++suffixLength) {
int domainsAtThisSuffixLength = TLDsBySuffixLength.get(suffixLength).size(); int domainsAtThisSuffixLength = TLDsBySuffixLength.get(suffixLength).size();
@ -214,11 +222,19 @@ public class GenerateJflexTLDMacros {
* case-insensitively. * case-insensitively.
*/ */
private void writeOutput() throws IOException { private void writeOutput() throws IOException {
// Write the plain TLD list: a '#' header line followed by the TLDs in natural
// String order, joined by '\n'. The header must end with its own line break;
// without it the first TLD is glued onto the comment line instead of getting
// a line of its own.
Files.writeString(
    tldListFile,
    "# Generated from IANA Root Zone Database (gradlew generateTlds).\n"
        + processedTLDsLongestFirst.keySet().stream()
            .sorted()
            .collect(Collectors.joining("\n")),
    StandardCharsets.UTF_8);
final DateFormat dateFormat = final DateFormat dateFormat =
DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.FULL, Locale.ROOT); DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
try (Writer writer = try (Writer writer =
new OutputStreamWriter(Files.newOutputStream(outputFile), StandardCharsets.UTF_8)) { new OutputStreamWriter(Files.newOutputStream(jflexMacroFile), StandardCharsets.UTF_8)) {
writer.write(APACHE_LICENSE); writer.write(APACHE_LICENSE);
writer.write("// Generated from IANA Root Zone Database <"); writer.write("// Generated from IANA Root Zone Database <");
writer.write(tldFileURL.toString()); writer.write(tldFileURL.toString());
@ -270,7 +286,7 @@ public class GenerateJflexTLDMacros {
boolean isFirst = true; boolean isFirst = true;
for (String TLD : TLDs) { for (String TLD : TLDs) {
writer.write("\t"); writer.write(" ");
if (isFirst) { if (isFirst) {
isFirst = false; isFirst = false;
writer.write(" "); writer.write(" ");
@ -280,7 +296,7 @@ public class GenerateJflexTLDMacros {
writer.write(getCaseInsensitiveRegex(TLD)); writer.write(getCaseInsensitiveRegex(TLD));
writer.write(NL); writer.write(NL);
} }
writer.write("\t) \".\"? // Accept trailing root (empty) domain"); writer.write(" ) \".\"? // Accept trailing root (empty) domain");
writer.write(NL); writer.write(NL);
writer.write(NL); writer.write(NL);
} }