HTTPCLIENT-2336: Updated PublicSuffixMatcher to use 'formal algorithm' as specified https://github.com/publicsuffix/list/wiki/Format#formal-algorithm

This commit is contained in:
Adrian Woodhead 2024-07-05 15:04:06 +01:00 committed by Oleg Kalnichevski
parent 66a355c991
commit 715ec11060
5 changed files with 176 additions and 39 deletions

View File

@ -128,7 +128,6 @@ public final class PublicSuffixMatcher {
* @param domain
* @param expectedType expected domain type or {@code null} if any.
* @return domain root
*
* @since 4.5
*/
public String getDomainRoot(final String domain, final DomainType expectedType) {
@ -149,24 +148,30 @@ public final class PublicSuffixMatcher {
}
final DomainType domainRule = findEntry(rules, key);
if (match(domainRule, expectedType)) {
if (domainRule == DomainType.PRIVATE) {
return segment;
}
// Prior to version 5.4 the result for "private" rules was different. However, the
// PSL algorithm doesn't have any rules changing the result based on "domain type"
// see https://github.com/publicsuffix/list/wiki/Format#formal-algorithm
return result;
}
final int nextdot = segment.indexOf('.');
final String nextSegment = nextdot != -1 ? segment.substring(nextdot + 1) : null;
if (nextSegment != null) {
final DomainType wildcardDomainRule = findEntry(rules, "*." + IDN.toUnicode(nextSegment));
if (match(wildcardDomainRule, expectedType)) {
if (wildcardDomainRule == DomainType.PRIVATE) {
return segment;
}
return result;
}
// look for wildcard entries
final String wildcardKey = (nextSegment == null) ? "*" : "*." + IDN.toUnicode(nextSegment);
final DomainType wildcardDomainRule = findEntry(rules, wildcardKey);
if (match(wildcardDomainRule, expectedType)) {
return result;
}
// If we're out of segments, and we're not looking for a specific type of entry,
// apply the default `*` rule.
// This wildcard rule means any final segment in a domain is a public suffix,
// so the current `result` is the desired public suffix plus 1
if (nextSegment == null && (expectedType == null || expectedType == DomainType.UNKNOWN)) {
return result;
}
result = segment;
segment = nextSegment;
}
@ -181,7 +186,7 @@ public final class PublicSuffixMatcher {
}
/**
* Tests whether the given domain matches any of entry from the public suffix list.
* Tests whether the given domain matches any of the entries from the public suffix list.
*/
public boolean matches(final String domain) {
return matches(domain, null);

View File

@ -111,7 +111,7 @@ class TestPublicSuffixListParser {
cookie.setDomain(".blah");
cookie.setAttribute(Cookie.DOMAIN_ATTR, ".blah");
Assertions.assertTrue(filter.match(cookie, new CookieOrigin("somehost.blah", 80, "/stuff", false)));
Assertions.assertFalse(filter.match(cookie, new CookieOrigin("somehost.blah", 80, "/stuff", false)));
}
@Test

View File

@ -29,6 +29,7 @@ package org.apache.hc.client5.http.psl;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;
@ -39,50 +40,77 @@ import org.junit.jupiter.api.Test;
class TestPublicSuffixMatcher {
private static final String SOURCE_FILE = "suffixlistmatcher.txt";
private static final String PUBLIC_SUFFIX_LIST_FILE = "mozilla/public-suffix-list.txt";
private PublicSuffixMatcher matcher;
private PublicSuffixMatcher pslMatcher;
@BeforeEach
void setUp() throws Exception {
final ClassLoader classLoader = getClass().getClassLoader();
// Create a matcher using a custom crafted public suffix list file
final InputStream in = classLoader.getResourceAsStream(SOURCE_FILE);
Assertions.assertNotNull(in);
final List<PublicSuffixList> lists = PublicSuffixListParser.INSTANCE.parseByType(
new InputStreamReader(in, StandardCharsets.UTF_8));
matcher = new PublicSuffixMatcher(lists);
// Create a matcher using the public suffix list file provided by Mozilla
// Note: the test requires `mvn generate-resources` to have been called to fetch the Mozilla file into
// target/classes so that it is on the classpath
final URL publicSuffixListUrl = classLoader.getResource(PUBLIC_SUFFIX_LIST_FILE);
pslMatcher = PublicSuffixMatcherLoader.load(publicSuffixListUrl);
}
@Test
void testGetDomainRootAnyType() {
// ICANN
Assertions.assertEquals(null, matcher.getDomainRoot("com"));
Assertions.assertEquals("blah.com", matcher.getDomainRoot("blah.com"));
Assertions.assertEquals("foo.com", matcher.getDomainRoot("foo.com"));
Assertions.assertEquals(null, matcher.getDomainRoot("blah.foo.com"));
Assertions.assertEquals(null, matcher.getDomainRoot("booh.foo.com"));
Assertions.assertEquals("blah.blah.foo.com", matcher.getDomainRoot("blah.blah.foo.com"));
Assertions.assertEquals(null, matcher.getDomainRoot("kioto.jp"));
Assertions.assertEquals(null, matcher.getDomainRoot("tokyo.jp"));
Assertions.assertEquals(null, matcher.getDomainRoot("blah.tokyo.jp"));
Assertions.assertEquals(null, matcher.getDomainRoot("booh.tokyo.jp"));
Assertions.assertEquals("blah.blah.tokyo.jp", matcher.getDomainRoot("blah.blah.tokyo.jp"));
Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp"));
Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.ac.jp"));
Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.blah.ac.jp"));
Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp"));
// Private
Assertions.assertEquals("xx", matcher.getDomainRoot("example.XX"));
Assertions.assertEquals("xx", matcher.getDomainRoot("www.example.XX"));
Assertions.assertEquals("xx", matcher.getDomainRoot("www.blah.blah.example.XX"));
Assertions.assertEquals("appspot.com", matcher.getDomainRoot("example.appspot.com"));
Assertions.assertEquals("example.xx", matcher.getDomainRoot("example.XX"));
Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.example.XX"));
Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.blah.blah.example.XX"));
Assertions.assertEquals(null, matcher.getDomainRoot("appspot.com"));
Assertions.assertEquals("example.appspot.com", matcher.getDomainRoot("example.appspot.com"));
// Too short
Assertions.assertNull(matcher.getDomainRoot("jp"));
Assertions.assertNull(matcher.getDomainRoot("ac.jp"));
Assertions.assertNull(matcher.getDomainRoot("any.tokyo.jp"));
// ICANN
Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp"));
Assertions.assertEquals("blah.blah.tokyo.jp", matcher.getDomainRoot("blah.blah.tokyo.jp"));
Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.blah.ac.jp"));
// Unknown
Assertions.assertEquals("garbage", matcher.getDomainRoot("garbage"));
Assertions.assertEquals("garbage", matcher.getDomainRoot("garbage.garbage"));
Assertions.assertEquals("garbage", matcher.getDomainRoot("*.garbage.garbage"));
Assertions.assertEquals("garbage", matcher.getDomainRoot("*.garbage.garbage.garbage"));
Assertions.assertEquals(null, matcher.getDomainRoot("garbage"));
Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("garbage.garbage"));
Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("*.garbage.garbage"));
Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("*.garbage.garbage.garbage"));
Assertions.assertEquals("*.compute-1.amazonaws.com", matcher.getDomainRoot("*.compute-1.amazonaws.com"));
Assertions.assertEquals(null, matcher.getDomainRoot("*.compute-1.amazonaws.com"));
Assertions.assertEquals(null, matcher.getDomainRoot("blah.compute-1.amazonaws.com"));
Assertions.assertEquals("blah.blah.compute-1.amazonaws.com", matcher.getDomainRoot("blah.blah.compute-1.amazonaws.com"));
}
@Test
void testGetDomainRootOnlyPRIVATE() {
// Private
Assertions.assertEquals("xx", matcher.getDomainRoot("example.XX", DomainType.PRIVATE));
Assertions.assertEquals("xx", matcher.getDomainRoot("www.example.XX", DomainType.PRIVATE));
Assertions.assertEquals("xx", matcher.getDomainRoot("www.blah.blah.example.XX", DomainType.PRIVATE));
Assertions.assertEquals("appspot.com", matcher.getDomainRoot("example.appspot.com"));
Assertions.assertEquals("example.xx", matcher.getDomainRoot("example.XX", DomainType.PRIVATE));
Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.example.XX", DomainType.PRIVATE));
Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.blah.blah.example.XX", DomainType.PRIVATE));
Assertions.assertEquals("example.appspot.com", matcher.getDomainRoot("example.appspot.com"));
// Too short
Assertions.assertNull(matcher.getDomainRoot("jp", DomainType.PRIVATE));
Assertions.assertNull(matcher.getDomainRoot("ac.jp", DomainType.PRIVATE));
@ -120,16 +148,15 @@ class TestPublicSuffixMatcher {
Assertions.assertNull(matcher.getDomainRoot("*.garbage.garbage.garbage", DomainType.ICANN));
}
@Test
void testMatch() {
Assertions.assertTrue(matcher.matches(".jp"));
Assertions.assertTrue(matcher.matches(".ac.jp"));
Assertions.assertTrue(matcher.matches(".any.tokyo.jp"));
Assertions.assertTrue(matcher.matches(".xx"));
Assertions.assertTrue(matcher.matches(".appspot.com"));
// exception
Assertions.assertFalse(matcher.matches(".metro.tokyo.jp"));
Assertions.assertFalse(matcher.matches(".xx"));
Assertions.assertFalse(matcher.matches(".appspot.com"));
}
@Test
@ -140,4 +167,108 @@ class TestPublicSuffixMatcher {
Assertions.assertTrue(matcher.matches(".xn--h-2fa.no"));
}
private void checkPublicSuffix(final String input, final String expected) {
Assertions.assertEquals(expected, pslMatcher.getDomainRoot(input));
}
//see https://github.com/publicsuffix/list/blob/master/tests/test_psl.txt
@Test
void testGetDomainRootPublicSuffixList() {
// null input.
checkPublicSuffix(null, null);
// Mixed case.
checkPublicSuffix("COM", null);
checkPublicSuffix("example.COM", "example.com");
checkPublicSuffix("WwW.example.COM", "example.com");
// Leading dot.
checkPublicSuffix(".com", null);
checkPublicSuffix(".example", null);
checkPublicSuffix(".example.com", null);
checkPublicSuffix(".example.example", null);
// Unlisted TLD.
checkPublicSuffix("example", null);
checkPublicSuffix("example.example", "example.example");
checkPublicSuffix("b.example.example", "example.example");
checkPublicSuffix("a.b.example.example", "example.example");
// Listed, but non-Internet, TLD.
//checkPublicSuffix("local", null);
//checkPublicSuffix("example.local", null);
//checkPublicSuffix("b.example.local", null);
//checkPublicSuffix("a.b.example.local", null);
// TLD with only 1 rule.
checkPublicSuffix("biz", null);
checkPublicSuffix("domain.biz", "domain.biz");
checkPublicSuffix("b.domain.biz", "domain.biz");
checkPublicSuffix("a.b.domain.biz", "domain.biz");
// TLD with some 2-level rules.
checkPublicSuffix("com", null);
checkPublicSuffix("example.com", "example.com");
checkPublicSuffix("b.example.com", "example.com");
checkPublicSuffix("a.b.example.com", "example.com");
checkPublicSuffix("uk.com", null);
checkPublicSuffix("example.uk.com", "example.uk.com");
checkPublicSuffix("b.example.uk.com", "example.uk.com");
checkPublicSuffix("a.b.example.uk.com", "example.uk.com");
checkPublicSuffix("test.ac", "test.ac");
// TLD with only 1 (wildcard) rule.
checkPublicSuffix("mm", null);
checkPublicSuffix("c.mm", null);
checkPublicSuffix("b.c.mm", "b.c.mm");
checkPublicSuffix("a.b.c.mm", "b.c.mm");
// More complex TLD.
checkPublicSuffix("jp", null);
checkPublicSuffix("test.jp", "test.jp");
checkPublicSuffix("www.test.jp", "test.jp");
checkPublicSuffix("ac.jp", null);
checkPublicSuffix("test.ac.jp", "test.ac.jp");
checkPublicSuffix("www.test.ac.jp", "test.ac.jp");
checkPublicSuffix("kyoto.jp", null);
checkPublicSuffix("test.kyoto.jp", "test.kyoto.jp");
checkPublicSuffix("ide.kyoto.jp", null);
checkPublicSuffix("b.ide.kyoto.jp", "b.ide.kyoto.jp");
checkPublicSuffix("a.b.ide.kyoto.jp", "b.ide.kyoto.jp");
checkPublicSuffix("c.kobe.jp", null);
checkPublicSuffix("b.c.kobe.jp", "b.c.kobe.jp");
checkPublicSuffix("a.b.c.kobe.jp", "b.c.kobe.jp");
checkPublicSuffix("city.kobe.jp", "city.kobe.jp");
checkPublicSuffix("www.city.kobe.jp", "city.kobe.jp");
// TLD with a wildcard rule and exceptions.
checkPublicSuffix("ck", null);
checkPublicSuffix("test.ck", null);
checkPublicSuffix("b.test.ck", "b.test.ck");
checkPublicSuffix("a.b.test.ck", "b.test.ck");
checkPublicSuffix("www.ck", "www.ck");
checkPublicSuffix("www.www.ck", "www.ck");
// US K12.
checkPublicSuffix("us", null);
checkPublicSuffix("test.us", "test.us");
checkPublicSuffix("www.test.us", "test.us");
checkPublicSuffix("ak.us", null);
checkPublicSuffix("test.ak.us", "test.ak.us");
checkPublicSuffix("www.test.ak.us", "test.ak.us");
checkPublicSuffix("k12.ak.us", null);
checkPublicSuffix("test.k12.ak.us", "test.k12.ak.us");
checkPublicSuffix("www.test.k12.ak.us", "test.k12.ak.us");
// IDN labels.
checkPublicSuffix("食狮.com.cn", "食狮.com.cn");
checkPublicSuffix("食狮.公司.cn", "食狮.公司.cn");
checkPublicSuffix("www.食狮.公司.cn", "食狮.公司.cn");
checkPublicSuffix("shishi.公司.cn", "shishi.公司.cn");
checkPublicSuffix("公司.cn", null);
checkPublicSuffix("食狮.中国", "食狮.中国");
checkPublicSuffix("www.食狮.中国", "食狮.中国");
checkPublicSuffix("shishi.中国", "shishi.中国");
checkPublicSuffix("中国", null);
// Same as above, but punycoded.
checkPublicSuffix("xn--85x722f.com.cn", "xn--85x722f.com.cn");
checkPublicSuffix("xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn");
checkPublicSuffix("www.xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn");
checkPublicSuffix("shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn");
checkPublicSuffix("xn--55qx5d.cn", null);
checkPublicSuffix("xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s");
checkPublicSuffix("www.xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s");
checkPublicSuffix("shishi.xn--fiqs8s", "shishi.xn--fiqs8s");
checkPublicSuffix("xn--fiqs8s", null);
}
}

View File

@ -276,11 +276,11 @@ class TestDefaultHostnameVerifier {
Assertions.assertTrue(DefaultHostnameVerifier.matchIdentity("a.b.xxx.uk", "a.b.xxx.uk", publicSuffixMatcher));
Assertions.assertTrue(DefaultHostnameVerifier.matchIdentityStrict("a.b.xxx.uk", "a.b.xxx.uk", publicSuffixMatcher));
Assertions.assertTrue(DefaultHostnameVerifier.matchIdentity("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher));
Assertions.assertTrue(DefaultHostnameVerifier.matchIdentityStrict("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher));
Assertions.assertFalse(DefaultHostnameVerifier.matchIdentity("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher));
Assertions.assertFalse(DefaultHostnameVerifier.matchIdentityStrict("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher));
Assertions.assertTrue(DefaultHostnameVerifier.matchIdentity("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher));
Assertions.assertTrue(DefaultHostnameVerifier.matchIdentityStrict("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher));
Assertions.assertFalse(DefaultHostnameVerifier.matchIdentity("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher));
Assertions.assertFalse(DefaultHostnameVerifier.matchIdentityStrict("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher));
Assertions.assertFalse(DefaultHostnameVerifier.matchIdentity("b.xxx.uk", "*.xxx.uk", publicSuffixMatcher));
Assertions.assertFalse(DefaultHostnameVerifier.matchIdentityStrict("b.xxx.uk", "*.xxx.uk", publicSuffixMatcher));

View File

@ -39,13 +39,14 @@ us-east-1.amazonaws.com
// ===BEGIN ICANN DOMAINS===
jp
ac.jp
*.jp
*.tokyo.jp
!metro.tokyo.jp
com
co.jp
gov.uk
*.foo.com
// unicode
no