From 2ce04ceab9371d23d53c87f5a356cd5c509134fb Mon Sep 17 00:00:00 2001
From: dmsnell <dmsnell@git.wordpress.org>
Date: Tue, 3 Sep 2024 19:50:14 +0000
Subject: [PATCH] HTML API: Ensure that NULL and whitespace-only CDATA sections
 don't forbid FRAMESET.

When CDATA sections (which can only occur inside SVG and MathML content) consist only of NULL bytes or whitespace characters, they should not clear the "frameset ok" flag. Previously they always cleared this flag; this patch updates the logic to detect these sequences properly.

Developed in https://github.com/WordPress/wordpress-develop/pull/7230
Discussed in https://core.trac.wordpress.org/ticket/61576

Follow-up to [58867].

Props dmsnell, jonsurrell.
See #61576.

Built from https://develop.svn.wordpress.org/trunk@58977


git-svn-id: http://core.svn.wordpress.org/trunk@58373 1a063a9b-81f0-0310-95a4-ce76da25c4cd
---
 .../html-api/class-wp-html-processor.php      |  24 +++-
 .../html-api/class-wp-html-tag-processor.php  | 107 ++++++++----------
 wp-includes/version.php                       |   2 +-
 3 files changed, 66 insertions(+), 67 deletions(-)

diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php
index d924c9be5d..661b9c712a 100644
--- a/wp-includes/html-api/class-wp-html-processor.php
+++ b/wp-includes/html-api/class-wp-html-processor.php
@@ -843,10 +843,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 
 		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
 			parent::next_token();
-			if (
-				WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
-				WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
-			) {
+			if ( WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ) {
 				parent::subdivide_text_appropriately();
 			}
 		}
@@ -4375,7 +4372,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		}
 
 		switch ( $op ) {
-			case '#cdata-section':
 			case '#text':
 				/*
 				 * > A character token that is U+0000 NULL
@@ -4395,6 +4391,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				$this->insert_foreign_element( $this->state->current_token, false );
 				return true;
 
+			/*
+			 * CDATA sections are alternate wrappers for text content and therefore
+			 * ought to follow the same rules as text nodes.
+			 */
+			case '#cdata-section':
+				/*
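+				 * NULL bytes and whitespace do not change the frameset-ok flag. The offsets below skip the 9-byte `<![CDATA[` opener and exclude it plus the 3-byte `]]>` closer (12 bytes total).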
+				 */
+				$current_token        = $this->bookmarks[ $this->state->current_token->bookmark_name ];
+				$cdata_content_start  = $current_token->start + 9;
+				$cdata_content_length = $current_token->length - 12;
+				if ( strspn( $this->html, "\0 \t\n\f\r", $cdata_content_start, $cdata_content_length ) !== $cdata_content_length ) {
+					$this->state->frameset_ok = false;
+				}
+
+				$this->insert_foreign_element( $this->state->current_token, false );
+				return true;
+
 			/*
 			 * > A comment token
 			 */
diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php
index e4397b2644..e8572935a6 100644
--- a/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -3337,8 +3337,8 @@ class WP_HTML_Tag_Processor {
 	}
 
 	/**
-	 * Subdivides a matched text node or CDATA text node, splitting NULL byte sequences
-	 * and decoded whitespace as distinct prefixes.
+	 * Subdivides a matched text node, splitting NULL byte sequences and decoded whitespace as
+	 * distinct prefixes.
 	 *
 	 * Note that once anything that's neither a NULL byte nor decoded whitespace is
 	 * encountered, then the remainder of the text node is left intact as generic text.
@@ -3368,70 +3368,55 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether the text node was subdivided.
 	 */
 	public function subdivide_text_appropriately(): bool {
-		$this->text_node_classification = self::TEXT_IS_GENERIC;
-
-		if ( self::STATE_TEXT_NODE === $this->parser_state ) {
-			/*
-			 * NULL bytes are treated categorically different than numeric character
-			 * references whose number is zero. `&#x00;` is not the same as `"\x00"`.
-			 */
-			$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
-			if ( $leading_nulls > 0 ) {
-				$this->token_length             = $leading_nulls;
-				$this->text_length              = $leading_nulls;
-				$this->bytes_already_parsed     = $this->token_starts_at + $leading_nulls;
-				$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
-				return true;
-			}
-
-			/*
-			 * Start a decoding loop to determine the point at which the
-			 * text subdivides. This entails raw whitespace bytes and any
-			 * character reference that decodes to the same.
-			 */
-			$at  = $this->text_starts_at;
-			$end = $this->text_starts_at + $this->text_length;
-			while ( $at < $end ) {
-				$skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
-				$at     += $skipped;
-
-				if ( $at < $end && '&' === $this->html[ $at ] ) {
-					$matched_byte_length = null;
-					$replacement         = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
-					if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
-						$at += $matched_byte_length;
-						continue;
-					}
-				}
-
-				break;
-			}
-
-			if ( $at > $this->text_starts_at ) {
-				$new_length                     = $at - $this->text_starts_at;
-				$this->text_length              = $new_length;
-				$this->token_length             = $new_length;
-				$this->bytes_already_parsed     = $at;
-				$this->text_node_classification = self::TEXT_IS_WHITESPACE;
-				return true;
-			}
-
+		if ( self::STATE_TEXT_NODE !== $this->parser_state ) {
 			return false;
 		}
 
-		// Unlike text nodes, there are no character references within CDATA sections.
-		if ( self::STATE_CDATA_NODE === $this->parser_state ) {
-			$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
-			if ( $leading_nulls === $this->text_length ) {
-				$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
-				return true;
+		$this->text_node_classification = self::TEXT_IS_GENERIC;
+
+		/*
+		 * NULL bytes are treated categorically different than numeric character
+		 * references whose number is zero. `&#x00;` is not the same as `"\x00"`.
+		 */
+		$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
+		if ( $leading_nulls > 0 ) {
+			$this->token_length             = $leading_nulls;
+			$this->text_length              = $leading_nulls;
+			$this->bytes_already_parsed     = $this->token_starts_at + $leading_nulls;
+			$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
+			return true;
+		}
+
+		/*
+		 * Start a decoding loop to determine the point at which the
+		 * text subdivides. This entails raw whitespace bytes and any
+		 * character reference that decodes to the same.
+		 */
+		$at  = $this->text_starts_at;
+		$end = $this->text_starts_at + $this->text_length;
+		while ( $at < $end ) {
+			$skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
+			$at     += $skipped;
+
+			if ( $at < $end && '&' === $this->html[ $at ] ) {
+				$matched_byte_length = null;
+				$replacement         = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
+				if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
+					$at += $matched_byte_length;
+					continue;
+				}
 			}
 
-			$leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length );
-			if ( $leading_ws === $this->text_length ) {
-				$this->text_node_classification = self::TEXT_IS_WHITESPACE;
-				return true;
-			}
+			break;
+		}
+
+		if ( $at > $this->text_starts_at ) {
+			$new_length                     = $at - $this->text_starts_at;
+			$this->text_length              = $new_length;
+			$this->token_length             = $new_length;
+			$this->bytes_already_parsed     = $at;
+			$this->text_node_classification = self::TEXT_IS_WHITESPACE;
+			return true;
 		}
 
 		return false;
diff --git a/wp-includes/version.php b/wp-includes/version.php
index 0c86a35335..807bfa3b4d 100644
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -16,7 +16,7 @@
  *
  * @global string $wp_version
  */
-$wp_version = '6.7-alpha-58976';
+$wp_version = '6.7-alpha-58977';
 
 /**
  * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.