mirror of https://github.com/apache/lucene.git
LUCENE-9904: Port GenerateJflexTLDMacros.java regeneration to gradle and regenerate UAX tokenizer with up-to-date TLDs
This commit is contained in:
parent
efeea0b8ee
commit
39071dbc54
|
@ -53,6 +53,35 @@ configure(project(":lucene:core")) {
|
|||
}
|
||||
|
||||
configure(project(":lucene:analysis:common")) {
|
||||
// Runs the GenerateJflexTLDMacros tool (from the "tools" source set) against the
// root zone URL below and declares the two files it is asked to write as task outputs.
task generateTlds() {
  description "Regenerate top-level domain jflex macros and tests"
  group "generation"

  // Arguments handed to the generator, in the order it expects them:
  // zone file URL, jflex macro output file, TLD list output file.
  def rootZoneUrl = "https://www.internic.net/zones/root.zone"
  def macroFile = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
  def tldListFile = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")

  // Resolved lazily so the tools classpath is only looked up when the task graph is built.
  dependsOn { sourceSets.tools.runtimeClasspath }

  outputs.files(macroFile, tldListFile)

  doFirst {
    project.javaexec {
      classpath = sourceSets.tools.runtimeClasspath
      main = "org.apache.lucene.analysis.standard.GenerateJflexTLDMacros"
      args = [rootZoneUrl, macroFile, tldListFile]

      // A non-zero exit code from the generator must fail the build.
      ignoreExitValue = false
    }

    logger.lifecycle("You've regenerated the TLD include file, remember to regenerate UAX29URLEmailTokenizerImpl too.")
  }
}
|
||||
|
||||
task generateWikipediaTokenizer(type: JFlexTask) {
|
||||
description "Regenerate WikipediaTokenizerImpl.java"
|
||||
group "generation"
|
||||
|
@ -77,6 +106,13 @@ configure(project(":lucene:analysis:common")) {
|
|||
skeleton = skeletonNoBufferExpansion
|
||||
heapSize = "12g"
|
||||
|
||||
// Don't enforce strict dependency (although it would make sense to regenerate both).
|
||||
// Just ensure we run after TLD regeneration, if it runs too.
|
||||
mustRunAfter generateTlds
|
||||
|
||||
// Additional input (included by the source jflex file)
|
||||
inputs.file file('src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex')
|
||||
|
||||
doFirst {
|
||||
logger.lifecycle("Regenerating UAX29URLEmailTokenizerImpl. This may take a long time (and requires ${heapSize} of memory!).")
|
||||
}
|
||||
|
@ -121,7 +157,8 @@ configure(project(":lucene:analysis:common")) {
|
|||
regenerate.dependsOn wrapWithPersistentChecksums(generateWikipediaTokenizer, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateClassicTokenizer, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateUAX29URLEmailTokenizer, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateHTMLStripCharFilter, [ andThenTasks: "spotlessApply" ])
|
||||
wrapWithPersistentChecksums(generateHTMLStripCharFilter, [ andThenTasks: "spotlessApply" ]),
|
||||
wrapWithPersistentChecksums(generateTlds, [ andThenTasks: "spotlessApply" ])
|
||||
}
|
||||
|
||||
class JFlexTask extends DefaultTask {
|
||||
|
|
|
@ -234,6 +234,9 @@ Bug fixes
|
|||
|
||||
Changes in Backwards Compatibility Policy
|
||||
|
||||
* LUCENE-9904: Regenerated UAX29URLEmailTokenizer and the corresponding analyzer with up-to-date
|
||||
top-level domains. This may change the token sequence compared to previous Lucene versions. (Dawid Weiss)
|
||||
|
||||
* LUCENE-9669: DirectoryReader#open now accepts an argument to open indices created with versions
|
||||
older than N-1. Lucene now can open indices created with a major version of N-2 in read-only mode.
|
||||
Opening an index created with a major version of N-2 with an IndexWriter is not supported.
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
{
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "29b05df80da071a249ff68067d5833fd7e4a9ff4",
|
||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "b346a80f511b64a59d556eb3ef58cf396e98c631"
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "472a33bcf5741bc8923aaa2717000d9ccd62f1e2",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2",
|
||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "ebeb76148ee02612841c463819f86d8932c9a7c3"
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "29b05df80da071a249ff68067d5833fd7e4a9ff4",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "e437900d9570ca007f9c02c9ea286222b644c329"
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -132,7 +132,7 @@ ComplexContextEx = \p{LB:Complex_Context}
|
|||
// RFC-5321: Simple Mail Transfer Protocol
|
||||
// RFC-5322: Internet Message Format
|
||||
|
||||
%include ASCIITLD.jflex-macro
|
||||
%include ASCIITLD.jflex
|
||||
|
||||
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
|
||||
DomainLabelSequence = {DomainLabel} ("." {DomainLabel})*
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -40,6 +40,7 @@ import java.util.TreeMap;
|
|||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Generates a file containing JFlex macros to accept valid ASCII TLDs (top level domains), for
|
||||
|
@ -53,16 +54,18 @@ import java.util.regex.Pattern;
|
|||
public class GenerateJflexTLDMacros {
|
||||
|
||||
public static void main(String... args) throws Exception {
|
||||
if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
|
||||
if (args.length != 3 || args[0].equals("--help") || args[0].equals("-help")) {
|
||||
System.err.println("Cmd line params:");
|
||||
System.err.println(
|
||||
"\tjava " + GenerateJflexTLDMacros.class.getName() + "<ZoneFileURL> <JFlexOutputFile>");
|
||||
" java "
|
||||
+ GenerateJflexTLDMacros.class.getName()
|
||||
+ "<ZoneFileURL> <JFlexOutputFile> <TLDListFile>");
|
||||
System.exit(1);
|
||||
}
|
||||
new GenerateJflexTLDMacros(args[0], args[1]).execute();
|
||||
new GenerateJflexTLDMacros(args[0], args[1], args[2]).execute();
|
||||
}
|
||||
|
||||
private static final String NL = System.getProperty("line.separator");
|
||||
private static final String NL = "\n";
|
||||
|
||||
private static final String APACHE_LICENSE =
|
||||
"/*"
|
||||
|
@ -103,16 +106,21 @@ public class GenerateJflexTLDMacros {
|
|||
Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
|
||||
private final URL tldFileURL;
|
||||
private long tldFileLastModified = -1L;
|
||||
private final Path outputFile;
|
||||
private final Path tldListFile;
|
||||
private final Path jflexMacroFile;
|
||||
|
||||
private final SortedMap<String, Boolean> processedTLDsLongestFirst =
|
||||
new TreeMap<>(
|
||||
Comparator.comparing(String::length).reversed().thenComparing(String::compareTo));
|
||||
private final List<SortedSet<String>> TLDsBySuffixLength =
|
||||
new ArrayList<>(); // list position indicates suffix length
|
||||
|
||||
public GenerateJflexTLDMacros(String tldFileURL, String outputFile) throws Exception {
|
||||
// list position indicates suffix length
|
||||
private final List<SortedSet<String>> TLDsBySuffixLength = new ArrayList<>();
|
||||
|
||||
public GenerateJflexTLDMacros(String tldFileURL, String jflexFile, String tldListFile)
|
||||
throws Exception {
|
||||
this.tldFileURL = new URL(tldFileURL);
|
||||
this.outputFile = Paths.get(outputFile);
|
||||
this.jflexMacroFile = Paths.get(jflexFile);
|
||||
this.tldListFile = Paths.get(tldListFile);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -126,7 +134,7 @@ public class GenerateJflexTLDMacros {
|
|||
getIANARootZoneDatabase();
|
||||
partitionTLDprefixesBySuffixLength();
|
||||
writeOutput();
|
||||
System.out.println("Wrote TLD macros to '" + outputFile + "':");
|
||||
System.out.println("Wrote TLD macros to '" + jflexMacroFile + "':");
|
||||
int totalDomains = 0;
|
||||
for (int suffixLength = 0; suffixLength < TLDsBySuffixLength.size(); ++suffixLength) {
|
||||
int domainsAtThisSuffixLength = TLDsBySuffixLength.get(suffixLength).size();
|
||||
|
@ -214,11 +222,19 @@ public class GenerateJflexTLDMacros {
|
|||
* case-insensitively.
|
||||
*/
|
||||
private void writeOutput() throws IOException {
|
||||
Files.writeString(
|
||||
tldListFile,
|
||||
"# Generated from IANA Root Zone Database (gradlew generateTlds)."
|
||||
+ processedTLDsLongestFirst.keySet().stream()
|
||||
.sorted()
|
||||
.collect(Collectors.joining("\n")),
|
||||
StandardCharsets.UTF_8);
|
||||
|
||||
final DateFormat dateFormat =
|
||||
DateFormat.getDateTimeInstance(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||
dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
try (Writer writer =
|
||||
new OutputStreamWriter(Files.newOutputStream(outputFile), StandardCharsets.UTF_8)) {
|
||||
new OutputStreamWriter(Files.newOutputStream(jflexMacroFile), StandardCharsets.UTF_8)) {
|
||||
writer.write(APACHE_LICENSE);
|
||||
writer.write("// Generated from IANA Root Zone Database <");
|
||||
writer.write(tldFileURL.toString());
|
||||
|
@ -270,7 +286,7 @@ public class GenerateJflexTLDMacros {
|
|||
|
||||
boolean isFirst = true;
|
||||
for (String TLD : TLDs) {
|
||||
writer.write("\t");
|
||||
writer.write(" ");
|
||||
if (isFirst) {
|
||||
isFirst = false;
|
||||
writer.write(" ");
|
||||
|
@ -280,7 +296,7 @@ public class GenerateJflexTLDMacros {
|
|||
writer.write(getCaseInsensitiveRegex(TLD));
|
||||
writer.write(NL);
|
||||
}
|
||||
writer.write("\t) \".\"? // Accept trailing root (empty) domain");
|
||||
writer.write(" ) \".\"? // Accept trailing root (empty) domain");
|
||||
writer.write(NL);
|
||||
writer.write(NL);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue