mirror of https://github.com/apache/lucene.git
LUCENE-9924: generate TLD list from IANA TLD db, rather than root zone db (#77)
This simplifies the generator: the source file is a plain list of domains rather than a DNS zone file, so the regexes that parsed DNS records can be removed. The file may also change less often, since it contains just the list of TLDs and no additional DNS metadata.
This commit is contained in:
parent
f33335157d
commit
b0bd64c620
|
@ -54,7 +54,7 @@ configure(project(":lucene:core")) {
|
|||
|
||||
configure(project(":lucene:analysis:common")) {
|
||||
task generateTlds() {
|
||||
def tldZones = "https://www.internic.net/zones/root.zone"
|
||||
def tldZones = "https://data.iana.org/TLD/tlds-alpha-by-domain.txt"
|
||||
def jflexMacro = file("src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex")
|
||||
def tldList = file("src/test/org/apache/lucene/analysis/email/TLDs.txt")
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
{
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b1af8dc8532d853fcf1acde1f6c629750b296b40",
|
||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "b346a80f511b64a59d556eb3ef58cf396e98c631"
|
||||
}
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "41ecfd19595aaf19fe2ddffd7dadb26202e98fae",
|
||||
"lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TLDs.txt": "1c5a201efff431be1c62150aa6bd3dac0f3a21e2"
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "68263ff0a014904c6e89b040d868d8f399408908",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "b1af8dc8532d853fcf1acde1f6c629750b296b40",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "41ecfd19595aaf19fe2ddffd7dadb26202e98fae",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "e437900d9570ca007f9c02c9ea286222b644c329",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "56a751d27e481fb55388f91ebf34f5a0cb8cb1b2"
|
||||
}
|
||||
}
|
|
@ -14,8 +14,8 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
// Generated from IANA Root Zone Database <https://www.internic.net/zones/root.zone>
|
||||
// file version from 2021 Apr 10, Sat 17:37:00 Coordinated Universal Time
|
||||
// Generated from IANA TLD Database <https://data.iana.org/TLD/tlds-alpha-by-domain.txt>
|
||||
// file version from 2021 Apr 10, Sat 07:07:01 Coordinated Universal Time
|
||||
// generated by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
||||
|
||||
// LUCENE-8278: None of the TLDs in {ASCIITLD} is a 1-character-shorter prefix of another TLD
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Generated from IANA Root Zone Database (gradlew generateTlds).aaa
|
||||
# Generated from IANA TLD Database (gradlew generateTlds).aaa
|
||||
aarp
|
||||
abarth
|
||||
abb
|
||||
|
|
|
@ -37,18 +37,15 @@ import java.util.SortedSet;
|
|||
import java.util.TimeZone;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Generates a file containing JFlex macros to accept valid ASCII TLDs (top level domains), for
|
||||
* inclusion in JFlex grammars that can accept domain names.
|
||||
*
|
||||
* <p>The IANA Root Zone Database is queried via HTTP from URL cmdline arg #0, the response is
|
||||
* parsed, and the results are written out to a file containing a JFlex macro that will accept all
|
||||
* valid ASCII-only TLDs, including punycode forms of internationalized TLDs (output file cmdline
|
||||
* arg #1).
|
||||
* <p>The IANA TLD Database is queried via HTTP from URL cmdline arg #0, the response is parsed, and
|
||||
* the results are written out to a file containing a JFlex macro that will accept all valid
|
||||
* ASCII-only TLDs, including punycode forms of internationalized TLDs (output file cmdline arg #1).
|
||||
*/
|
||||
public class GenerateJflexTLDMacros {
|
||||
|
||||
|
@ -100,9 +97,6 @@ public class GenerateJflexTLDMacros {
|
|||
+ " */"
|
||||
+ NL;
|
||||
|
||||
private static final Pattern TLD_PATTERN_1 = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");
|
||||
private static final Pattern TLD_PATTERN_2 =
|
||||
Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
|
||||
private final URL tldFileURL;
|
||||
private long tldFileLastModified = -1L;
|
||||
private final Path tldListFile;
|
||||
|
@ -123,14 +117,14 @@ public class GenerateJflexTLDMacros {
|
|||
}
|
||||
|
||||
/**
|
||||
* Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then writes a set of JFlex
|
||||
* macros accepting any of them case-insensitively out to the specified output file.
|
||||
* Downloads the IANA TLD Database, extracts the ASCII TLDs, then writes a set of JFlex macros
|
||||
* accepting any of them case-insensitively out to the specified output file.
|
||||
*
|
||||
* @throws IOException if there is a problem either downloading the database or writing out the
|
||||
* output file.
|
||||
*/
|
||||
public void execute() throws IOException {
|
||||
getIANARootZoneDatabase();
|
||||
getIANATLDDatabase();
|
||||
partitionTLDprefixesBySuffixLength();
|
||||
writeOutput();
|
||||
System.out.println("Wrote TLD macros to '" + jflexMacroFile + "':");
|
||||
|
@ -145,11 +139,11 @@ public class GenerateJflexTLDMacros {
|
|||
}
|
||||
|
||||
/**
|
||||
* Downloads the IANA Root Zone Database.
|
||||
* Downloads the IANA TLD Database.
|
||||
*
|
||||
* @throws java.io.IOException if there is a problem downloading the database
|
||||
*/
|
||||
private void getIANARootZoneDatabase() throws IOException {
|
||||
private void getIANATLDDatabase() throws IOException {
|
||||
final URLConnection connection = tldFileURL.openConnection();
|
||||
connection.setUseCaches(false);
|
||||
connection.addRequestProperty("Cache-Control", "no-cache");
|
||||
|
@ -160,23 +154,16 @@ public class GenerateJflexTLDMacros {
|
|||
new InputStreamReader(connection.getInputStream(), StandardCharsets.US_ASCII))) {
|
||||
String line;
|
||||
while (null != (line = reader.readLine())) {
|
||||
Matcher matcher = TLD_PATTERN_1.matcher(line);
|
||||
if (matcher.matches()) {
|
||||
// System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
|
||||
processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
|
||||
} else {
|
||||
matcher = TLD_PATTERN_2.matcher(line);
|
||||
if (matcher.matches()) {
|
||||
// System.out.println("Found: " + matcher.group(1).toLowerCase(Locale.ROOT));
|
||||
processedTLDsLongestFirst.put(matcher.group(1).toLowerCase(Locale.ROOT), Boolean.FALSE);
|
||||
}
|
||||
if (line.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
processedTLDsLongestFirst.put(line.toLowerCase(Locale.ROOT), Boolean.FALSE);
|
||||
}
|
||||
}
|
||||
System.out.println(
|
||||
"Found "
|
||||
+ processedTLDsLongestFirst.size()
|
||||
+ " TLDs in IANA Root Zone Database at "
|
||||
+ " TLDs in IANA TLD Database at "
|
||||
+ tldFileURL);
|
||||
}
|
||||
|
||||
|
@ -223,7 +210,7 @@ public class GenerateJflexTLDMacros {
|
|||
private void writeOutput() throws IOException {
|
||||
Files.writeString(
|
||||
tldListFile,
|
||||
"# Generated from IANA Root Zone Database (gradlew generateTlds)."
|
||||
"# Generated from IANA TLD Database (gradlew generateTlds)."
|
||||
+ processedTLDsLongestFirst.keySet().stream()
|
||||
.sorted()
|
||||
.collect(Collectors.joining("\n")),
|
||||
|
@ -235,7 +222,7 @@ public class GenerateJflexTLDMacros {
|
|||
try (Writer writer =
|
||||
new OutputStreamWriter(Files.newOutputStream(jflexMacroFile), StandardCharsets.UTF_8)) {
|
||||
writer.write(APACHE_LICENSE);
|
||||
writer.write("// Generated from IANA Root Zone Database <");
|
||||
writer.write("// Generated from IANA TLD Database <");
|
||||
writer.write(tldFileURL.toString());
|
||||
writer.write(">");
|
||||
writer.write(NL);
|
||||
|
|
Loading…
Reference in New Issue