From 53fd556c73e30ec4b19b0a00e41c486ff5d95bc4 Mon Sep 17 00:00:00 2001
From: dmsnell <dmsnell@git.wordpress.org>
Date: Tue, 30 Jan 2024 22:09:22 +0000
Subject: [PATCH] HTML API: Fix splitting single text node.

When `next_token()` was introduced, it brought a subtle bug: when encountering a `<` in the HTML stream that did not lead to a tag, comment, or other token, it treated the text span up to that point as one text node and the following span as another text node.

The entire span should be one text node.

In this patch the Tag Processor properly detects this scenario and combines the spans into one text node.

Follow-up to [57348]

Props jonsurrell
Fixes #60385


Built from https://develop.svn.wordpress.org/trunk@57489


git-svn-id: http://core.svn.wordpress.org/trunk@56990 1a063a9b-81f0-0310-95a4-ce76da25c4cd
---
 .../html-api/class-wp-html-tag-processor.php  | 41 ++++++++++++++-----
 wp-includes/version.php                       |  2 +-
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php
index 6f763acbd5..169fabe750 100644
--- a/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -1512,16 +1512,6 @@ class WP_HTML_Tag_Processor {
 		while ( false !== $at && $at < $doc_length ) {
 			$at = strpos( $html, '<', $at );
 
-			if ( $at > $was_at ) {
-				$this->parser_state         = self::STATE_TEXT_NODE;
-				$this->token_starts_at      = $was_at;
-				$this->token_length         = $at - $was_at;
-				$this->text_starts_at       = $was_at;
-				$this->text_length          = $this->token_length;
-				$this->bytes_already_parsed = $at;
-				return true;
-			}
-
 			/*
 			 * This does not imply an incomplete parse; it indicates that there
 			 * can be nothing left in the document other than a #text node.
@@ -1536,6 +1526,37 @@ class WP_HTML_Tag_Processor {
 				return true;
 			}
 
+			if ( $at > $was_at ) {
+				/*
+				 * A "<" has been found in the document. That may be the start of another node, or
+				 * it may be an "invalid-first-character-of-tag-name" error. If this is not the start
+				 * of another node the "<" should be included in this text node and another
+				 * termination point should be found for the text node.
+				 *
+				 * @see https://html.spec.whatwg.org/#tag-open-state
+				 */
+				if ( strlen( $html ) > $at + 1 ) {
+					$next_character  = $html[ $at + 1 ];
+					$at_another_node =
+						'!' === $next_character ||
+						'/' === $next_character ||
+						'?' === $next_character ||
+						( 'A' <= $next_character && $next_character <= 'z' );
+					if ( ! $at_another_node ) {
+						++$at;
+						continue;
+					}
+				}
+
+				$this->parser_state         = self::STATE_TEXT_NODE;
+				$this->token_starts_at      = $was_at;
+				$this->token_length         = $at - $was_at;
+				$this->text_starts_at       = $was_at;
+				$this->text_length          = $this->token_length;
+				$this->bytes_already_parsed = $at;
+				return true;
+			}
+
 			$this->token_starts_at = $at;
 
 			if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) {
diff --git a/wp-includes/version.php b/wp-includes/version.php
index aeed653d10..fcbf4f9f30 100644
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -16,7 +16,7 @@
  *
  * @global string $wp_version
  */
-$wp_version = '6.5-alpha-57389';
+$wp_version = '6.5-alpha-57489';
 
 /**
  * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.