From 00dd23da6a5080725d55df71d89fd237fd46ce6c Mon Sep 17 00:00:00 2001 From: dmsnell Date: Fri, 20 Sep 2024 20:23:15 +0000 Subject: [PATCH] HTML API: Add `get_full_comment_text()` method. Previously, there were a few cases where the modifiable text read from an HTML comment differs slightly from the parsed value of its inner text in a browser. This is due to the specific way that invalid HTML syntax tokens become "bogus comments." This patch introduces a new method to the Tag Processor to allow differentiating these specific cases, such as when copying or serializing HTML from one source to another. Similar code has already been in use in the html5lib tests, and this patch simplifies the test runner, evidencing the fact that this method was already needed. Developed in https://github.com/wordpress/wordpress-develop/pull/7342 Discussed in https://core.trac.wordpress.org/ticket/62036 Props dmsnell, jonsurrell. See #62036. Built from https://develop.svn.wordpress.org/trunk@59075 git-svn-id: http://core.svn.wordpress.org/trunk@58471 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- .../html-api/class-wp-html-tag-processor.php | 52 +++++++++++++++++++ wp-includes/version.php | 2 +- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php index 233d47eb8d..2b115dd156 100644 --- a/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -3385,6 +3385,58 @@ class WP_HTML_Tag_Processor { return $this->comment_type; } + /** + * Returns the text of a matched comment or null if not on a comment type node. + * + * This method returns the entire text content of a comment node as it + * would appear in the browser. + * + * This differs from {@see ::get_modifiable_text()} in that certain comment + * types in the HTML API cannot allow their entire comment text content to + * be modified. 
Namely, "bogus comments" of the form `<?not allowed in html>` + * will create a comment whose text content starts with `?`. Note that if + * that character were modified, it would be possible to change the node + * type. + * + * @since 6.7.0 + * + * @return string|null The comment text as it would appear in the browser or null + * if not on a comment type node. + */ + public function get_full_comment_text(): ?string { + if ( self::STATE_FUNKY_COMMENT === $this->parser_state ) { + return $this->get_modifiable_text(); + } + + if ( self::STATE_COMMENT !== $this->parser_state ) { + return null; + } + + switch ( $this->get_comment_type() ) { + case self::COMMENT_AS_HTML_COMMENT: + case self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT: + return $this->get_modifiable_text(); + + case self::COMMENT_AS_CDATA_LOOKALIKE: + return "[CDATA[{$this->get_modifiable_text()}]]"; + + case self::COMMENT_AS_PI_NODE_LOOKALIKE: + return "?{$this->get_tag()}{$this->get_modifiable_text()}?"; + + /* + * This represents "bogus comments state" from HTML tokenization. + * This can be entered by `<?` or `<!`, where `?` is included in + * the comment text but `!` is not. + */ + case self::COMMENT_AS_INVALID_HTML: + $preceding_character = $this->html[ $this->text_starts_at - 1 ]; + $comment_start = '?' === $preceding_character ? '?' : ''; + return "{$comment_start}{$this->get_modifiable_text()}"; + } + + return null; + } + /** * Subdivides a matched text node, splitting NULL byte sequences and decoded whitespace as * distinct nodes prefixes. diff --git a/wp-includes/version.php b/wp-includes/version.php index 6ceac1f79f..fedbeb62b2 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.7-alpha-59074'; +$wp_version = '6.7-alpha-59075'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.