LUCENE-9924: generate TLD list from IANA TLD db, rather than root zone db (#77)

This simplifies the generator: the file is a plain list of domains
rather than a DNS zone file, so the regexes that parsed DNS records can be removed.

The file may also change less often, since it contains only the list of
TLDs and no additional DNS metadata.
This commit is contained in:
Robert Muir 2021-04-11 11:25:15 -04:00 committed by GitHub
parent f33335157d
commit b0bd64c620
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 23 additions and 36 deletions

View File

@ -54,7 +54,7 @@ configure(project(":lucene:core")) {
configure(project(":lucene:analysis:common")) {
task generateTlds() {
def tldZones = "https://www.internic.net/zones/root.zone"
def tldZones = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
def jflexMacro = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
def tldList = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")

View File

@ -1,4 +1,4 @@
{
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b1af8dc8532d853fcf1acde1f6c629750b296b40",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "b346a80f511b64a59d556eb3ef58cf396e98c631"
}
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "41ecfd19595aaf19fe2ddffd7dadb26202e98fae",
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "1c5a201efff431be1c62150aa6bd3dac0f3a21e2"
}

View File

@ -1,6 +1,6 @@
{
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b1af8dc8532d853fcf1acde1f6c629750b296b40",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "41ecfd19595aaf19fe2ddffd7dadb26202e98fae",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "e437900d9570ca007f9c02c9ea286222b644c329",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2"
}
}

View File

@ -14,8 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <https://www.internic.net/zones/root.zone>
// file version from 2021 Apr 10, Sat 17:37:00 Coordinated Universal Time
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
// file version from 2021 Apr 10, Sat 07:07:01 Coordinated Universal Time
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD

View File

@ -1,4 +1,4 @@
# Generated from IANA Root Zone Database (gradlew generateTlds).aaa
# Generated from IANA TLD Database (gradlew generateTlds).aaa
aarp
abarth
abb

View File

@ -37,18 +37,15 @@ import java.util.SortedSet;
import java.util.TimeZone;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Generates a file containing JFlex macros to accept valid ASCII TLDs (top level domains), for
* inclusion in JFlex grammars that can accept domain names.
*
* <p>The IANA Root Zone Database is queried via HTTP from URL cmdline arg #0, the response is
* parsed, and the results are written out to a file containing a JFlex macro that will accept all
* valid ASCII-only TLDs, including punycode forms of internationalized TLDs (output file cmdline
* arg #1).
* <p>The IANA TLD Database is queried via HTTP from URL cmdline arg #0, the response is parsed, and
* the results are written out to a file containing a JFlex macro that will accept all valid
* ASCII-only TLDs, including punycode forms of internationalized TLDs (output file cmdline arg #1).
*/
public class GenerateJflexTLDMacros {
@ -100,9 +97,6 @@ public class GenerateJflexTLDMacros {
+ " */"
+ NL;
private static final Pattern TLD_PATTERN_1 = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");
private static final Pattern TLD_PATTERN_2 =
Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
private final URL tldFileURL;
private long tldFileLastModified = -1L;
private final Path tldListFile;
@ -123,14 +117,14 @@ public class GenerateJflexTLDMacros {
}
/**
* Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then writes a set of JFlex
* macros accepting any of them case-insensitively out to the specified output file.
* Downloads the IANA TLD Database, extracts the ASCII TLDs, then writes a set of JFlex macros
* accepting any of them case-insensitively out to the specified output file.
*
* @throws IOException if there is a problem either downloading the database or writing out the
* output file.
*/
public void execute() throws IOException {
getIANARootZoneDatabase();
getIANATLDDatabase();
partitionTLDprefixesBySuffixLength();
writeOutput();
System.out.println("Wrote TLD macros to '" + jflexMacroFile + "':");
@ -145,11 +139,11 @@ public class GenerateJflexTLDMacros {
}
/**
* Downloads the IANA Root Zone Database.
* Downloads the IANA TLD Database.
*
* @throws java.io.IOException if there is a problem downloading the database
*/
private void getIANARootZoneDatabase() throws IOException {
private void getIANATLDDatabase() throws IOException {
final URLConnection connection = tldFileURL.openConnection();
connection.setUseCaches(false);
connection.addRequestProperty("Cache-Control", "no-cache");
@ -160,23 +154,16 @@ public class GenerateJflexTLDMacros {
new InputStreamReader(connection.getInputStream(), StandardCharsets.US_ASCII))) {
String line;
while (null != (line = reader.readLine())) {
Matcher matcher = TLD_PATTERN_1.matcher(line);
if (matcher.matches()) {
// System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
} else {
matcher = TLD_PATTERN_2.matcher(line);
if (matcher.matches()) {
// System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
}
if (line.startsWith("#")) {
continue;
}
processedTLDsLongestFirst.put(line.toLowerCase(Locale.ROOT), Boolean.FALSE);
}
}
System.out.println(
"Found "
+ processedTLDsLongestFirst.size()
+ " TLDs in IANA Root Zone Database at "
+ " TLDs in IANA TLD Database at "
+ tldFileURL);
}
@ -223,7 +210,7 @@ public class GenerateJflexTLDMacros {
private void writeOutput() throws IOException {
Files.writeString(
tldListFile,
"# Generated from IANA Root Zone Database (gradlew generateTlds)."
"# Generated from IANA TLD Database (gradlew generateTlds)."
+ processedTLDsLongestFirst.keySet().stream()
.sorted()
.collect(Collectors.joining("\n")),
@ -235,7 +222,7 @@ public class GenerateJflexTLDMacros {
try (Writer writer =
new OutputStreamWriter(Files.newOutputStream(jflexMacroFile), StandardCharsets.UTF_8)) {
writer.write(APACHE_LICENSE);
writer.write("// Generated from IANA Root Zone Database <");
writer.write("// Generated from IANA TLD Database <");
writer.write(tldFileURL.toString());
writer.write(">");
writer.write(NL);