mirror of https://github.com/apache/lucene.git
LUCENE-9904: Port GenerateJflexTLDMacros.java regeneration to gradle and regenerate UAX tokenizer with up-to-date TLDs
This commit is contained in:
parent
efeea0b8ee
commit
39071dbc54
|
@ -53,6 +53,35 @@ configure(project(":lucene:core")) {
|
||||||
}
|
}
|
||||||
|
|
||||||
configure(project(":lucene:analysis:common")) {
|
configure(project(":lucene:analysis:common")) {
|
||||||
|
task generateTlds() {
|
||||||
|
def tldZones = "https://www.internic.net/zones/root.zone"
|
||||||
|
def jflexMacro = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
|
||||||
|
def tldList = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")
|
||||||
|
|
||||||
|
description "Regenerate top-level domain jflex macros and tests"
|
||||||
|
group "generation"
|
||||||
|
|
||||||
|
dependsOn { sourceSets.tools.runtimeClasspath }
|
||||||
|
|
||||||
|
outputs.files jflexMacro, tldList
|
||||||
|
|
||||||
|
doFirst {
|
||||||
|
project.javaexec {
|
||||||
|
main = "org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
|
||||||
|
classpath = sourceSets.tools.runtimeClasspath
|
||||||
|
|
||||||
|
ignoreExitValue false
|
||||||
|
args = [
|
||||||
|
tldZones,
|
||||||
|
jflexMacro,
|
||||||
|
tldList
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.lifecycle("You've regenerated the TLD include file, remember to regenerate UAX29URLEmailTokenizerImpl too.")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
task generateWikipediaTokenizer(type: JFlexTask) {
|
task generateWikipediaTokenizer(type: JFlexTask) {
|
||||||
description "Regenerate WikipediaTokenizerImpl.java"
|
description "Regenerate WikipediaTokenizerImpl.java"
|
||||||
group "generation"
|
group "generation"
|
||||||
|
@ -77,6 +106,13 @@ configure(project(":lucene:analysis:common")) {
|
||||||
skeleton = skeletonNoBufferExpansion
|
skeleton = skeletonNoBufferExpansion
|
||||||
heapSize = "12g"
|
heapSize = "12g"
|
||||||
|
|
||||||
|
// Don't enforce strict dependency (although it would make sense to regenerate both).
|
||||||
|
// Just ensure we run after TLD regeneration, if it runs too.
|
||||||
|
mustRunAfter generateTlds
|
||||||
|
|
||||||
|
// Additional input (included by the source jflex file)
|
||||||
|
inputs.file file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex')
|
||||||
|
|
||||||
doFirst {
|
doFirst {
|
||||||
logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
|
logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
|
||||||
}
|
}
|
||||||
|
@ -121,7 +157,8 @@ configure(project(":lucene:analysis:common")) {
|
||||||
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizer, [ andThenTasks: "spotlessApply" ]),
|
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizer, [ andThenTasks: "spotlessApply" ]),
|
||||||
wrapWithPersistentChecksums(generateClassicTokenizer, [ andThenTasks: "spotlessApply" ]),
|
wrapWithPersistentChecksums(generateClassicTokenizer, [ andThenTasks: "spotlessApply" ]),
|
||||||
wrapWithPersistentChecksums(generateUAX29URLEmailTokenizer, [ andThenTasks: "spotlessApply" ]),
|
wrapWithPersistentChecksums(generateUAX29URLEmailTokenizer, [ andThenTasks: "spotlessApply" ]),
|
||||||
wrapWithPersistentChecksums(generateHTMLStripCharFilter, [ andThenTasks: "spotlessApply" ])
|
wrapWithPersistentChecksums(generateHTMLStripCharFilter, [ andThenTasks: "spotlessApply" ]),
|
||||||
|
wrapWithPersistentChecksums(generateTlds, [ andThenTasks: "spotlessApply" ])
|
||||||
}
|
}
|
||||||
|
|
||||||
class JFlexTask extends DefaultTask {
|
class JFlexTask extends DefaultTask {
|
||||||
|
|
|
@ -234,6 +234,9 @@ Bug fixes
|
||||||
|
|
||||||
Changes in Backwards Compatibility Policy
|
Changes in Backwards Compatibility Policy
|
||||||
|
|
||||||
|
* LUCENE-9904: regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date
|
||||||
|
top-level domains. This may change the token sequence compared to previous Lucene versions. (Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-9669: DirectoryReader#open now accepts an argument to open indices created with versions
|
* LUCENE-9669: DirectoryReader#open now accepts an argument to open indices created with versions
|
||||||
older than N-1. Lucene now can open indices created with a major version of N-2 in read-only mode.
|
older than N-1. Lucene now can open indices created with a major version of N-2 in read-only mode.
|
||||||
Opening an index created with a major version of N-2 with an IndexWriter is not supported.
|
Opening an index created with a major version of N-2 with an IndexWriter is not supported.
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
{
|
||||||
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "29b05df80da071a249ff68067d5833fd7e4a9ff4",
|
||||||
|
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "b346a80f511b64a59d556eb3ef58cf396e98c631"
|
||||||
|
}
|
|
@ -1,5 +1,6 @@
|
||||||
{
|
{
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "472a33bcf5741bc8923aaa2717000d9ccd62f1e2",
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2",
|
||||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
|
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
|
||||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ebeb76148ee02612841c463819f86d8932c9a7c3"
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "29b05df80da071a249ff68067d5833fd7e4a9ff4",
|
||||||
|
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "e437900d9570ca007f9c02c9ea286222b644c329"
|
||||||
}
|
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -132,7 +132,7 @@ ComplexContextEx = \p{LB:Complex_Context}
|
||||||
// RFC-5321: Simple Mail Transfer Protocol
|
// RFC-5321: Simple Mail Transfer Protocol
|
||||||
// RFC-5322: Internet Message Format
|
// RFC-5322: Internet Message Format
|
||||||
|
|
||||||
%include ASCIITLD.jflex-macro
|
%include ASCIITLD.jflex
|
||||||
|
|
||||||
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
|
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
|
||||||
DomainLabelSequence = {DomainLabel} ("." {DomainLabel})*
|
DomainLabelSequence = {DomainLabel} ("." {DomainLabel})*
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -40,6 +40,7 @@ import java.util.TreeMap;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates a file containing JFlex macros to accept valid ASCII TLDs (top-level domains), for
|
* Generates a file containing JFlex macros to accept valid ASCII TLDs (top-level domains), for
|
||||||
|
@ -53,16 +54,18 @@ import java.util.regex.Pattern;
|
||||||
public class GenerateJflexTLDMacros {
|
public class GenerateJflexTLDMacros {
|
||||||
|
|
||||||
public static void main(String... args) throws Exception {
|
public static void main(String... args) throws Exception {
|
||||||
if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
|
if (args.length != 3 || args[0].equals("--help") || args[0].equals("-help")) {
|
||||||
System.err.println("Cmd line params:");
|
System.err.println("Cmd line params:");
|
||||||
System.err.println(
|
System.err.println(
|
||||||
"\tjava " + GenerateJflexTLDMacros.class.getName() + "<ZoneFileURL> <JFlexOutputFile>");
|
" java "
|
||||||
|
+ GenerateJflexTLDMacros.class.getName()
|
||||||
|
+ "<ZoneFileURL> <JFlexOutputFile> <TLDListFile>");
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
}
|
}
|
||||||
new GenerateJflexTLDMacros(args[0], args[1]).execute();
|
new GenerateJflexTLDMacros(args[0], args[1], args[2]).execute();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final String NL = System.getProperty("line.separator");
|
private static final String NL = "\n";
|
||||||
|
|
||||||
private static final String APACHE_LICENSE =
|
private static final String APACHE_LICENSE =
|
||||||
"/*"
|
"/*"
|
||||||
|
@ -103,16 +106,21 @@ public class GenerateJflexTLDMacros {
|
||||||
Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
|
Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
|
||||||
private final URL tldFileURL;
|
private final URL tldFileURL;
|
||||||
private long tldFileLastModified = -1L;
|
private long tldFileLastModified = -1L;
|
||||||
private final Path outputFile;
|
private final Path tldListFile;
|
||||||
|
private final Path jflexMacroFile;
|
||||||
|
|
||||||
private final SortedMap<String, Boolean> processedTLDsLongestFirst =
|
private final SortedMap<String, Boolean> processedTLDsLongestFirst =
|
||||||
new TreeMap<>(
|
new TreeMap<>(
|
||||||
Comparator.comparing(String::length).reversed().thenComparing(String::compareTo));
|
Comparator.comparing(String::length).reversed().thenComparing(String::compareTo));
|
||||||
private final List<SortedSet<String>> TLDsBySuffixLength =
|
|
||||||
new ArrayList<>(); // list position indicates suffix length
|
|
||||||
|
|
||||||
public GenerateJflexTLDMacros(String tldFileURL, String outputFile) throws Exception {
|
// list position indicates suffix length
|
||||||
|
private final List<SortedSet<String>> TLDsBySuffixLength = new ArrayList<>();
|
||||||
|
|
||||||
|
public GenerateJflexTLDMacros(String tldFileURL, String jflexFile, String tldListFile)
|
||||||
|
throws Exception {
|
||||||
this.tldFileURL = new URL(tldFileURL);
|
this.tldFileURL = new URL(tldFileURL);
|
||||||
this.outputFile = Paths.get(outputFile);
|
this.jflexMacroFile = Paths.get(jflexFile);
|
||||||
|
this.tldListFile = Paths.get(tldListFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -126,7 +134,7 @@ public class GenerateJflexTLDMacros {
|
||||||
getIANARootZoneDatabase();
|
getIANARootZoneDatabase();
|
||||||
partitionTLDprefixesBySuffixLength();
|
partitionTLDprefixesBySuffixLength();
|
||||||
writeOutput();
|
writeOutput();
|
||||||
System.out.println("Wrote TLD macros to '" + outputFile + "':");
|
System.out.println("Wrote TLD macros to '" + jflexMacroFile + "':");
|
||||||
int totalDomains = 0;
|
int totalDomains = 0;
|
||||||
for (int suffixLength = 0; suffixLength < TLDsBySuffixLength.size(); ++suffixLength) {
|
for (int suffixLength = 0; suffixLength < TLDsBySuffixLength.size(); ++suffixLength) {
|
||||||
int domainsAtThisSuffixLength = TLDsBySuffixLength.get(suffixLength).size();
|
int domainsAtThisSuffixLength = TLDsBySuffixLength.get(suffixLength).size();
|
||||||
|
@ -214,11 +222,19 @@ public class GenerateJflexTLDMacros {
|
||||||
* case-insensitively.
|
* case-insensitively.
|
||||||
*/
|
*/
|
||||||
private void writeOutput() throws IOException {
|
private void writeOutput() throws IOException {
|
||||||
|
Files.writeString(
|
||||||
|
tldListFile,
|
||||||
|
"# Generated from IANA Root Zone Database (gradlew generateTlds)."
|
||||||
|
+ processedTLDsLongestFirst.keySet().stream()
|
||||||
|
.sorted()
|
||||||
|
.collect(Collectors.joining("\n")),
|
||||||
|
StandardCharsets.UTF_8);
|
||||||
|
|
||||||
final DateFormat dateFormat =
|
final DateFormat dateFormat =
|
||||||
DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||||
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||||
try (Writer writer =
|
try (Writer writer =
|
||||||
new OutputStreamWriter(Files.newOutputStream(outputFile), StandardCharsets.UTF_8)) {
|
new OutputStreamWriter(Files.newOutputStream(jflexMacroFile), StandardCharsets.UTF_8)) {
|
||||||
writer.write(APACHE_LICENSE);
|
writer.write(APACHE_LICENSE);
|
||||||
writer.write("// Generated from IANA Root Zone Database <");
|
writer.write("// Generated from IANA Root Zone Database <");
|
||||||
writer.write(tldFileURL.toString());
|
writer.write(tldFileURL.toString());
|
||||||
|
@ -270,7 +286,7 @@ public class GenerateJflexTLDMacros {
|
||||||
|
|
||||||
boolean isFirst = true;
|
boolean isFirst = true;
|
||||||
for (String TLD : TLDs) {
|
for (String TLD : TLDs) {
|
||||||
writer.write("\t");
|
writer.write(" ");
|
||||||
if (isFirst) {
|
if (isFirst) {
|
||||||
isFirst = false;
|
isFirst = false;
|
||||||
writer.write(" ");
|
writer.write(" ");
|
||||||
|
@ -280,7 +296,7 @@ public class GenerateJflexTLDMacros {
|
||||||
writer.write(getCaseInsensitiveRegex(TLD));
|
writer.write(getCaseInsensitiveRegex(TLD));
|
||||||
writer.write(NL);
|
writer.write(NL);
|
||||||
}
|
}
|
||||||
writer.write("\t) \".\"? // Accept trailing root (empty) domain");
|
writer.write(" ) \".\"? // Accept trailing root (empty) domain");
|
||||||
writer.write(NL);
|
writer.write(NL);
|
||||||
writer.write(NL);
|
writer.write(NL);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue