From 715ec110600c2f0ee5c4fa03085306db175afba5 Mon Sep 17 00:00:00 2001 From: Adrian Woodhead Date: Fri, 5 Jul 2024 15:04:06 +0100 Subject: [PATCH] HTTPCLIENT-2336: Updated PublicSuffixMatcher to use 'formal algorithm' as specified https://github.com/publicsuffix/list/wiki/Format#formal-algorithm --- .../client5/http/psl/PublicSuffixMatcher.java | 31 ++-- .../cookie/TestPublicSuffixListParser.java | 2 +- .../http/psl/TestPublicSuffixMatcher.java | 171 ++++++++++++++++-- .../http/ssl/TestDefaultHostnameVerifier.java | 8 +- .../src/test/resources/suffixlistmatcher.txt | 3 +- 5 files changed, 176 insertions(+), 39 deletions(-) diff --git a/httpclient5/src/main/java/org/apache/hc/client5/http/psl/PublicSuffixMatcher.java b/httpclient5/src/main/java/org/apache/hc/client5/http/psl/PublicSuffixMatcher.java index 191d4d5b5..37c03c785 100644 --- a/httpclient5/src/main/java/org/apache/hc/client5/http/psl/PublicSuffixMatcher.java +++ b/httpclient5/src/main/java/org/apache/hc/client5/http/psl/PublicSuffixMatcher.java @@ -128,7 +128,6 @@ public String getDomainRoot(final String domain) { * @param domain * @param expectedType expected domain type or {@code null} if any. * @return domain root - * * @since 4.5 */ public String getDomainRoot(final String domain, final DomainType expectedType) { @@ -149,24 +148,30 @@ public String getDomainRoot(final String domain, final DomainType expectedType) } final DomainType domainRule = findEntry(rules, key); if (match(domainRule, expectedType)) { - if (domainRule == DomainType.PRIVATE) { - return segment; - } + // Prior to version 5.4 the result for "private" rules was different. However, the + // PSL algorithm doesn't have any rules changing the result based on "domain type" + // see https://github.com/publicsuffix/list/wiki/Format#formal-algorithm return result; } final int nextdot = segment.indexOf('.'); final String nextSegment = nextdot != -1 ? segment.substring(nextdot + 1) : null; - if (nextSegment != null) { - final DomainType wildcardDomainRule = findEntry(rules, "*." + IDN.toUnicode(nextSegment)); - if (match(wildcardDomainRule, expectedType)) { - if (wildcardDomainRule == DomainType.PRIVATE) { - return segment; - } - return result; - } + // look for wildcard entries + final String wildcardKey = (nextSegment == null) ? "*" : "*." + IDN.toUnicode(nextSegment); + final DomainType wildcardDomainRule = findEntry(rules, wildcardKey); + if (match(wildcardDomainRule, expectedType)) { + return result; } + + // If we're out of segments, and we're not looking for a specific type of entry, + // apply the default `*` rule. + // This wildcard rule means any final segment in a domain is a public suffix, + // so the current `result` is the desired public suffix plus 1 + if (nextSegment == null && (expectedType == null || expectedType == DomainType.UNKNOWN)) { + return result; + } + result = segment; segment = nextSegment; } @@ -181,7 +186,7 @@ public String getDomainRoot(final String domain, final DomainType expectedType) } /** - * Tests whether the given domain matches any of entry from the public suffix list. + * Tests whether the given domain matches any of the entries from the public suffix list. */ public boolean matches(final String domain) { return matches(domain, null); diff --git a/httpclient5/src/test/java/org/apache/hc/client5/http/impl/cookie/TestPublicSuffixListParser.java b/httpclient5/src/test/java/org/apache/hc/client5/http/impl/cookie/TestPublicSuffixListParser.java index aeb7a971b..60cf10556 100644 --- a/httpclient5/src/test/java/org/apache/hc/client5/http/impl/cookie/TestPublicSuffixListParser.java +++ b/httpclient5/src/test/java/org/apache/hc/client5/http/impl/cookie/TestPublicSuffixListParser.java @@ -111,7 +111,7 @@ void testParseLocal() { cookie.setDomain(".blah"); cookie.setAttribute(Cookie.DOMAIN_ATTR, ".blah"); - Assertions.assertTrue(filter.match(cookie, new CookieOrigin("somehost.blah", 80, "/stuff", false))); + Assertions.assertFalse(filter.match(cookie, new CookieOrigin("somehost.blah", 80, "/stuff", false))); } @Test diff --git a/httpclient5/src/test/java/org/apache/hc/client5/http/psl/TestPublicSuffixMatcher.java b/httpclient5/src/test/java/org/apache/hc/client5/http/psl/TestPublicSuffixMatcher.java index c47c9b967..1ecf6a483 100644 --- a/httpclient5/src/test/java/org/apache/hc/client5/http/psl/TestPublicSuffixMatcher.java +++ b/httpclient5/src/test/java/org/apache/hc/client5/http/psl/TestPublicSuffixMatcher.java @@ -29,6 +29,7 @@ import java.io.InputStream; import java.io.InputStreamReader; +import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.List; @@ -39,50 +40,77 @@ class TestPublicSuffixMatcher { private static final String SOURCE_FILE = "suffixlistmatcher.txt"; + private static final String PUBLIC_SUFFIX_LIST_FILE = "mozilla/public-suffix-list.txt"; private PublicSuffixMatcher matcher; + private PublicSuffixMatcher pslMatcher; @BeforeEach void setUp() throws Exception { final ClassLoader classLoader = getClass().getClassLoader(); + + // Create a matcher using a custom crafted public suffix list file final InputStream in = classLoader.getResourceAsStream(SOURCE_FILE); Assertions.assertNotNull(in); final List lists = PublicSuffixListParser.INSTANCE.parseByType( new InputStreamReader(in, StandardCharsets.UTF_8)); matcher = new PublicSuffixMatcher(lists); + + // Create a matcher using the public suffix list file provided by Mozilla + // Note: the test requires `mvn generate-resources` to have been called to fetch the Mozilla file into + // target/classes so that it is on the classpath + final URL publicSuffixListUrl = classLoader.getResource(PUBLIC_SUFFIX_LIST_FILE); + pslMatcher = PublicSuffixMatcherLoader.load(publicSuffixListUrl); } @Test void testGetDomainRootAnyType() { + // ICANN + Assertions.assertEquals(null, matcher.getDomainRoot("com")); + Assertions.assertEquals("blah.com", matcher.getDomainRoot("blah.com")); + Assertions.assertEquals("foo.com", matcher.getDomainRoot("foo.com")); + Assertions.assertEquals(null, matcher.getDomainRoot("blah.foo.com")); + Assertions.assertEquals(null, matcher.getDomainRoot("booh.foo.com")); + Assertions.assertEquals("blah.blah.foo.com", matcher.getDomainRoot("blah.blah.foo.com")); + + Assertions.assertEquals(null, matcher.getDomainRoot("kioto.jp")); + Assertions.assertEquals(null, matcher.getDomainRoot("tokyo.jp")); + Assertions.assertEquals(null, matcher.getDomainRoot("blah.tokyo.jp")); + Assertions.assertEquals(null, matcher.getDomainRoot("booh.tokyo.jp")); + Assertions.assertEquals("blah.blah.tokyo.jp", matcher.getDomainRoot("blah.blah.tokyo.jp")); + Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp")); + Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.ac.jp")); + Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.blah.ac.jp")); + Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp")); + // Private - Assertions.assertEquals("xx", matcher.getDomainRoot("example.XX")); - Assertions.assertEquals("xx", matcher.getDomainRoot("www.example.XX")); - Assertions.assertEquals("xx", matcher.getDomainRoot("www.blah.blah.example.XX")); - Assertions.assertEquals("appspot.com", matcher.getDomainRoot("example.appspot.com")); + Assertions.assertEquals("example.xx", matcher.getDomainRoot("example.XX")); + Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.example.XX")); + Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.blah.blah.example.XX")); + Assertions.assertEquals(null, matcher.getDomainRoot("appspot.com")); + Assertions.assertEquals("example.appspot.com", matcher.getDomainRoot("example.appspot.com")); // Too short Assertions.assertNull(matcher.getDomainRoot("jp")); Assertions.assertNull(matcher.getDomainRoot("ac.jp")); Assertions.assertNull(matcher.getDomainRoot("any.tokyo.jp")); - // ICANN - Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp")); - Assertions.assertEquals("blah.blah.tokyo.jp", matcher.getDomainRoot("blah.blah.tokyo.jp")); - Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.blah.ac.jp")); // Unknown - Assertions.assertEquals("garbage", matcher.getDomainRoot("garbage")); - Assertions.assertEquals("garbage", matcher.getDomainRoot("garbage.garbage")); - Assertions.assertEquals("garbage", matcher.getDomainRoot("*.garbage.garbage")); - Assertions.assertEquals("garbage", matcher.getDomainRoot("*.garbage.garbage.garbage")); + Assertions.assertEquals(null, matcher.getDomainRoot("garbage")); + Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("garbage.garbage")); + Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("*.garbage.garbage")); + Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("*.garbage.garbage.garbage")); - Assertions.assertEquals("*.compute-1.amazonaws.com", matcher.getDomainRoot("*.compute-1.amazonaws.com")); + Assertions.assertEquals(null, matcher.getDomainRoot("*.compute-1.amazonaws.com")); + Assertions.assertEquals(null, matcher.getDomainRoot("blah.compute-1.amazonaws.com")); + Assertions.assertEquals("blah.blah.compute-1.amazonaws.com", matcher.getDomainRoot("blah.blah.compute-1.amazonaws.com")); } @Test void testGetDomainRootOnlyPRIVATE() { // Private - Assertions.assertEquals("xx", matcher.getDomainRoot("example.XX", DomainType.PRIVATE)); - Assertions.assertEquals("xx", matcher.getDomainRoot("www.example.XX", DomainType.PRIVATE)); - Assertions.assertEquals("xx", matcher.getDomainRoot("www.blah.blah.example.XX", DomainType.PRIVATE)); - Assertions.assertEquals("appspot.com", matcher.getDomainRoot("example.appspot.com")); + Assertions.assertEquals("example.xx", matcher.getDomainRoot("example.XX", DomainType.PRIVATE)); + Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.example.XX", DomainType.PRIVATE)); + Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.blah.blah.example.XX", DomainType.PRIVATE)); + Assertions.assertEquals("example.appspot.com", matcher.getDomainRoot("example.appspot.com")); // Too short Assertions.assertNull(matcher.getDomainRoot("jp", DomainType.PRIVATE)); Assertions.assertNull(matcher.getDomainRoot("ac.jp", DomainType.PRIVATE)); @@ -120,16 +148,15 @@ void testGetDomainRootOnlyICANN() { Assertions.assertNull(matcher.getDomainRoot("*.garbage.garbage.garbage", DomainType.ICANN)); } - @Test void testMatch() { Assertions.assertTrue(matcher.matches(".jp")); Assertions.assertTrue(matcher.matches(".ac.jp")); Assertions.assertTrue(matcher.matches(".any.tokyo.jp")); + Assertions.assertTrue(matcher.matches(".xx")); + Assertions.assertTrue(matcher.matches(".appspot.com")); // exception Assertions.assertFalse(matcher.matches(".metro.tokyo.jp")); - Assertions.assertFalse(matcher.matches(".xx")); - Assertions.assertFalse(matcher.matches(".appspot.com")); } @Test @@ -140,4 +167,108 @@ void testMatchUnicode() { Assertions.assertTrue(matcher.matches(".xn--h-2fa.no")); } + private void checkPublicSuffix(final String input, final String expected) { + Assertions.assertEquals(expected, pslMatcher.getDomainRoot(input)); + } + + //see https://github.com/publicsuffix/list/blob/master/tests/test_psl.txt + @Test + void testGetDomainRootPublicSuffixList() { + // null input. + checkPublicSuffix(null, null); + // Mixed case. + checkPublicSuffix("COM", null); + checkPublicSuffix("example.COM", "example.com"); + checkPublicSuffix("WwW.example.COM", "example.com"); + // Leading dot. + checkPublicSuffix(".com", null); + checkPublicSuffix(".example", null); + checkPublicSuffix(".example.com", null); + checkPublicSuffix(".example.example", null); + // Unlisted TLD. + checkPublicSuffix("example", null); + checkPublicSuffix("example.example", "example.example"); + checkPublicSuffix("b.example.example", "example.example"); + checkPublicSuffix("a.b.example.example", "example.example"); + // Listed, but non-Internet, TLD. + //checkPublicSuffix("local", null); + //checkPublicSuffix("example.local", null); + //checkPublicSuffix("b.example.local", null); + //checkPublicSuffix("a.b.example.local", null); + // TLD with only 1 rule. + checkPublicSuffix("biz", null); + checkPublicSuffix("domain.biz", "domain.biz"); + checkPublicSuffix("b.domain.biz", "domain.biz"); + checkPublicSuffix("a.b.domain.biz", "domain.biz"); + // TLD with some 2-level rules. + checkPublicSuffix("com", null); + checkPublicSuffix("example.com", "example.com"); + checkPublicSuffix("b.example.com", "example.com"); + checkPublicSuffix("a.b.example.com", "example.com"); + checkPublicSuffix("uk.com", null); + checkPublicSuffix("example.uk.com", "example.uk.com"); + checkPublicSuffix("b.example.uk.com", "example.uk.com"); + checkPublicSuffix("a.b.example.uk.com", "example.uk.com"); + checkPublicSuffix("test.ac", "test.ac"); + // TLD with only 1 (wildcard) rule. + checkPublicSuffix("mm", null); + checkPublicSuffix("c.mm", null); + checkPublicSuffix("b.c.mm", "b.c.mm"); + checkPublicSuffix("a.b.c.mm", "b.c.mm"); + // More complex TLD. + checkPublicSuffix("jp", null); + checkPublicSuffix("test.jp", "test.jp"); + checkPublicSuffix("www.test.jp", "test.jp"); + checkPublicSuffix("ac.jp", null); + checkPublicSuffix("test.ac.jp", "test.ac.jp"); + checkPublicSuffix("www.test.ac.jp", "test.ac.jp"); + checkPublicSuffix("kyoto.jp", null); + checkPublicSuffix("test.kyoto.jp", "test.kyoto.jp"); + checkPublicSuffix("ide.kyoto.jp", null); + checkPublicSuffix("b.ide.kyoto.jp", "b.ide.kyoto.jp"); + checkPublicSuffix("a.b.ide.kyoto.jp", "b.ide.kyoto.jp"); + checkPublicSuffix("c.kobe.jp", null); + checkPublicSuffix("b.c.kobe.jp", "b.c.kobe.jp"); + checkPublicSuffix("a.b.c.kobe.jp", "b.c.kobe.jp"); + checkPublicSuffix("city.kobe.jp", "city.kobe.jp"); + checkPublicSuffix("www.city.kobe.jp", "city.kobe.jp"); + // TLD with a wildcard rule and exceptions. + checkPublicSuffix("ck", null); + checkPublicSuffix("test.ck", null); + checkPublicSuffix("b.test.ck", "b.test.ck"); + checkPublicSuffix("a.b.test.ck", "b.test.ck"); + checkPublicSuffix("www.ck", "www.ck"); + checkPublicSuffix("www.www.ck", "www.ck"); + // US K12. + checkPublicSuffix("us", null); + checkPublicSuffix("test.us", "test.us"); + checkPublicSuffix("www.test.us", "test.us"); + checkPublicSuffix("ak.us", null); + checkPublicSuffix("test.ak.us", "test.ak.us"); + checkPublicSuffix("www.test.ak.us", "test.ak.us"); + checkPublicSuffix("k12.ak.us", null); + checkPublicSuffix("test.k12.ak.us", "test.k12.ak.us"); + checkPublicSuffix("www.test.k12.ak.us", "test.k12.ak.us"); + // IDN labels. + checkPublicSuffix("食狮.com.cn", "食狮.com.cn"); + checkPublicSuffix("食狮.公司.cn", "食狮.公司.cn"); + checkPublicSuffix("www.食狮.公司.cn", "食狮.公司.cn"); + checkPublicSuffix("shishi.公司.cn", "shishi.公司.cn"); + checkPublicSuffix("公司.cn", null); + checkPublicSuffix("食狮.中国", "食狮.中国"); + checkPublicSuffix("www.食狮.中国", "食狮.中国"); + checkPublicSuffix("shishi.中国", "shishi.中国"); + checkPublicSuffix("中国", null); + // Same as above, but punycoded. + checkPublicSuffix("xn--85x722f.com.cn", "xn--85x722f.com.cn"); + checkPublicSuffix("xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn"); + checkPublicSuffix("www.xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn"); + checkPublicSuffix("shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn"); + checkPublicSuffix("xn--55qx5d.cn", null); + checkPublicSuffix("xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s"); + checkPublicSuffix("www.xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s"); + checkPublicSuffix("shishi.xn--fiqs8s", "shishi.xn--fiqs8s"); + checkPublicSuffix("xn--fiqs8s", null); + } + } diff --git a/httpclient5/src/test/java/org/apache/hc/client5/http/ssl/TestDefaultHostnameVerifier.java b/httpclient5/src/test/java/org/apache/hc/client5/http/ssl/TestDefaultHostnameVerifier.java index 7b6e2214e..b672127cf 100644 --- a/httpclient5/src/test/java/org/apache/hc/client5/http/ssl/TestDefaultHostnameVerifier.java +++ b/httpclient5/src/test/java/org/apache/hc/client5/http/ssl/TestDefaultHostnameVerifier.java @@ -276,11 +276,11 @@ void testIdentityMatching() { Assertions.assertTrue(DefaultHostnameVerifier.matchIdentity("a.b.xxx.uk", "a.b.xxx.uk", publicSuffixMatcher)); Assertions.assertTrue(DefaultHostnameVerifier.matchIdentityStrict("a.b.xxx.uk", "a.b.xxx.uk", publicSuffixMatcher)); - Assertions.assertTrue(DefaultHostnameVerifier.matchIdentity("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher)); - Assertions.assertTrue(DefaultHostnameVerifier.matchIdentityStrict("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher)); + Assertions.assertFalse(DefaultHostnameVerifier.matchIdentity("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher)); + Assertions.assertFalse(DefaultHostnameVerifier.matchIdentityStrict("a.b.xxx.uk", "*.b.xxx.uk", publicSuffixMatcher)); - Assertions.assertTrue(DefaultHostnameVerifier.matchIdentity("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher)); - Assertions.assertTrue(DefaultHostnameVerifier.matchIdentityStrict("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher)); + Assertions.assertFalse(DefaultHostnameVerifier.matchIdentity("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher)); + Assertions.assertFalse(DefaultHostnameVerifier.matchIdentityStrict("b.xxx.uk", "b.xxx.uk", publicSuffixMatcher)); Assertions.assertFalse(DefaultHostnameVerifier.matchIdentity("b.xxx.uk", "*.xxx.uk", publicSuffixMatcher)); Assertions.assertFalse(DefaultHostnameVerifier.matchIdentityStrict("b.xxx.uk", "*.xxx.uk", publicSuffixMatcher)); diff --git a/httpclient5/src/test/resources/suffixlistmatcher.txt b/httpclient5/src/test/resources/suffixlistmatcher.txt index 5f82ba9a7..decd10794 100644 --- a/httpclient5/src/test/resources/suffixlistmatcher.txt +++ b/httpclient5/src/test/resources/suffixlistmatcher.txt @@ -39,13 +39,14 @@ us-east-1.amazonaws.com // ===BEGIN ICANN DOMAINS=== jp -ac.jp +*.jp *.tokyo.jp !metro.tokyo.jp com co.jp gov.uk +*.foo.com // unicode no