HTML API: Introduce full parsing mode in HTML Processor.

The HTML Processor has only supported a specific kind of parsing mode called _the fragment parsing mode_, where it behaves in the same way that `node.innerHTML = html` does in the DOM. This mode assumes a context node and doesn't support parsing an entire document. As part of work to add more spec support to the HTML API, this patch introduces a full parsing mode, which can parse a full HTML document from start to end, including the doctype declaration and head tags. Developed in https://github.com/wordpress/wordpress-develop/pull/6977 Discussed in https://core.trac.wordpress.org/ticket/61576 Props: dmsnell, jonsurrell. See #61576. Built from https://develop.svn.wordpress.org/trunk@58836 git-svn-id: http://core.svn.wordpress.org/trunk@58232 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2024-07-31 16:56:15 +00:00 · 2024-07-31 16:56:15 +00:00 · af6e4904af
parent 2602d78021
commit af6e4904af
3 changed files with 577 additions and 52 deletions
--- a/wp-includes/html-api/class-wp-html-processor-state.php
+++ b/wp-includes/html-api/class-wp-html-processor-state.php
@ -428,6 +428,38 @@ class WP_HTML_Processor_State {
 	 */
 	public $context_node = null;

+	/**
+	 * The recognized encoding of the input byte stream.
+	 *
+	 * > The stream of code points that comprises the input to the tokenization
+	 * > stage will be initially seen by the user agent as a stream of bytes
+	 * > (typically coming over the network or from the local file system).
+	 * > The bytes encode the actual characters according to a particular character
+	 * > encoding, which the user agent uses to decode the bytes into characters.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @var string|null
+	 */
+	public $encoding = null;
+
+	/**
+	 * The parser's confidence in the input encoding.
+	 *
+	 * > When the HTML parser is decoding an input byte stream, it uses a character
+	 * > encoding and a confidence. The confidence is either tentative, certain, or
+	 * > irrelevant. The encoding used, and whether the confidence in that encoding
+	 * > is tentative or certain, is used during the parsing to determine whether to
+	 * > change the encoding. If no encoding is necessary, e.g. because the parser is
+	 * > operating on a Unicode stream and doesn't have to use a character encoding
+	 * > at all, then the confidence is irrelevant.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @var string
+	 */
+	public $encoding_confidence = 'tentative';
+
 	/**
 	 * HEAD element pointer.
 	 *
--- a/wp-includes/html-api/class-wp-html-processor.php
+++ b/wp-includes/html-api/class-wp-html-processor.php
@ -256,21 +256,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 */
 	private $context_node = null;

-	/**
-	 * Whether the parser has yet processed the context node,
-	 * if created as a fragment parser.
-	 *
-	 * The context node will be initially pushed onto the stack of open elements,
-	 * but when created as a fragment parser, this context element (and the implicit
-	 * HTML document node above it) should not be exposed as a matched token or node.
-	 *
-	 * This boolean indicates whether the processor should skip over the current
-	 * node in its initial search for the first node created from the input HTML.
-	 *
-	 * @var bool
-	 */
-	private $has_seen_context_node = false;
-
 	/*
 	 * Public Interface Functions
 	 */
@ -315,6 +300,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		$processor                             = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
 		$processor->state->context_node        = array( 'BODY', array() );
 		$processor->state->insertion_mode      = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+		$processor->state->encoding            = $encoding;
+		$processor->state->encoding_confidence = 'certain';

 		// @todo Create "fake" bookmarks for non-existent but implied nodes.
 		$processor->bookmarks['root-node']    = new WP_HTML_Span( 0, 0 );
@ -340,6 +327,34 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		return $processor;
 	}

+	/**
+	 * Creates an HTML processor in the full parsing mode.
+	 *
+	 * It's likely that a fragment parser is more appropriate, unless sending an
+	 * entire HTML document from start to finish. Consider a fragment parser with
+	 * a context node of `<body>`.
+	 *
+	 * Since UTF-8 is the only currently-accepted charset, if working with a
+	 * document that isn't UTF-8, it's important to convert the document before
+	 * creating the processor: pass in the converted HTML.
+	 *
+	 * @param string      $html                    Input HTML document to process.
+	 * @param string|null $known_definite_encoding Optional. If provided, specifies the charset used
+	 *                                             in the input byte stream. Currently must be UTF-8.
+	 * @return static|null The created processor if successful, otherwise null.
+	 */
+	public static function create_full_parser( $html, $known_definite_encoding = 'UTF-8' ) {
+		if ( 'UTF-8' !== $known_definite_encoding ) {
+			return null;
+		}
+
+		$processor                             = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
+		$processor->state->encoding            = $known_definite_encoding;
+		$processor->state->encoding_confidence = 'certain';
+
+		return $processor;
+	}
+
 	/**
 	 * Constructor.
 	 *
@ -993,7 +1008,62 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_initial(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_INITIAL . ' state.' );
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION,
+			 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
+			 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 *
+			 * Parse error: ignore the token.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step();
+				}
+				goto initial_anything_else;
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				$contents = $this->get_modifiable_text();
+				if ( ' html' !== $contents ) {
+					/*
+					 * @todo When the HTML Tag Processor fully parses the DOCTYPE declaration,
+					 *       this code should examine the contents to set the compatability mode.
+					 */
+					$this->bail( 'Cannot process any DOCTYPE other than a normative HTML5 doctype.' );
+				}
+
+				/*
+				 * > Then, switch the insertion mode to "before html".
+				 */
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
+				return true;
+		}
+
+		/*
+		 * > Anything else
+		 */
+		initial_anything_else:
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
 	}

 	/**
@ -1002,7 +1072,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * This internal function performs the 'before html' insertion mode
 	 * logic for the generalized WP_HTML_Processor::step() function.
 	 *
-	 * @since 6.7.0 Stub implementation.
+	 * @since 6.7.0
 	 *
 	 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
 	 *
@ -1012,7 +1082,86 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_before_html(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML . ' state.' );
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$is_closer  = parent::is_tag_closer();
+		$op_sigil   = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION,
+			 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
+			 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 *
+			 * Parse error: ignore the token.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step();
+				}
+				goto before_html_anything_else;
+				break;
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD;
+				return true;
+
+			/*
+			 * > An end tag whose tag name is one of: "head", "body", "html", "br"
+			 *
+			 * Closing BR tags are always reported by the Tag Processor as opening tags.
+			 */
+			case '-HEAD':
+			case '-BODY':
+			case '-HTML':
+				/*
+				 * > Act as described in the "anything else" entry below.
+				 */
+				goto before_html_anything_else;
+				break;
+		}
+
+		/*
+		 * > Any other end tag
+		 */
+		if ( $is_closer ) {
+			// Parse error: ignore the token.
+			return $this->step();
+		}
+
+		/*
+		 * > Anything else.
+		 *
+		 * > Create an html element whose node document is the Document object.
+		 * > Append it to the Document object. Put this element in the stack of open elements.
+		 * > Switch the insertion mode to "before head", then reprocess the token.
+		 */
+		before_html_anything_else:
+		$this->insert_virtual_node( 'HTML' );
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
 	}

 	/**
@ -1031,7 +1180,86 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_before_head(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD . ' state.' );
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$is_closer  = parent::is_tag_closer();
+		$op_sigil   = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION,
+			 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
+			 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 *
+			 * Parse error: ignore the token.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step();
+				}
+				goto before_head_anything_else;
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > A start tag whose tag name is "head"
+			 */
+			case '+HEAD':
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->head_element   = $this->state->current_token;
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD;
+				return true;
+
+			/*
+			 * > An end tag whose tag name is one of: "head", "body", "html", "br"
+			 * > Act as described in the "anything else" entry below.
+			 *
+			 * Closing BR tags are always reported by the Tag Processor as opening tags.
+			 */
+			case '-HEAD':
+			case '-BODY':
+			case '-HTML':
+				goto before_head_anything_else;
+				break;
+		}
+
+		if ( $is_closer ) {
+			// Parse error: ignore the token.
+			return $this->step();
+		}
+
+		/*
+		 * > Anything else
+		 *
+		 * > Insert an HTML element for a "head" start tag token with no attributes.
+		 */
+		before_head_anything_else:
+		$this->state->head_element   = $this->insert_virtual_node( 'HEAD' );
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
 	}

 	/**
@ -1056,12 +1284,13 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		$op_sigil   = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : '';
 		$op         = "{$op_sigil}{$token_name}";

+		switch ( $op ) {
+			case '#text':
 				/*
 				 * > A character token that is one of U+0009 CHARACTER TABULATION,
 				 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
 				 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
 				 */
-		if ( '#text' === $op ) {
 				$text = $this->get_modifiable_text();
 				if ( '' === $text ) {
 					/*
@ -1076,9 +1305,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 					$this->insert_html_element( $this->state->current_token );
 					return true;
 				}
-		}

-		switch ( $op ) {
+				goto in_head_anything_else;
+				break;
+
 			/*
 			 * > A comment token
 			 */
@ -1124,7 +1354,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				 * >     tentative, then change the encoding to the resulting encoding.
 				 */
 				$charset = $this->get_attribute( 'charset' );
-				if ( is_string( $charset ) ) {
+				if ( is_string( $charset ) && 'tentative' === $this->state->encoding_confidence ) {
 					$this->bail( 'Cannot yet process META tags with charset to determine encoding.' );
 				}

@ -1141,7 +1371,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				if (
 					is_string( $http_equiv ) &&
 					is_string( $content ) &&
-					0 === strcasecmp( $http_equiv, 'Content-Type' )
+					0 === strcasecmp( $http_equiv, 'Content-Type' ) &&
+					'tentative' === $this->state->encoding_confidence
 				) {
 					$this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' );
 				}
@ -1193,10 +1424,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {

 			/*
 			 * > An end tag whose tag name is one of: "body", "html", "br"
+			 *
+			 * BR tags are always reported by the Tag Processor as opening tags.
 			 */
 			case '-BODY':
 			case '-HTML':
-			case '-BR':
 				/*
 				 * > Act as described in the "anything else" entry below.
 				 */
@ -1273,7 +1505,92 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_in_head_noscript(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT . ' state.' );
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$is_closer  = parent::is_tag_closer();
+		$op_sigil   = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION,
+			 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
+			 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 *
+			 * Parse error: ignore the token.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step_in_head();
+				}
+
+				goto in_head_noscript_anything_else;
+				break;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > An end tag whose tag name is "noscript"
+			 */
+			case '-NOSCRIPT':
+				$this->state->stack_of_open_elements->pop();
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD;
+				return true;
+
+			/*
+			 * > A comment token
+			 * >
+			 * > A start tag whose tag name is one of: "basefont", "bgsound",
+			 * > "link", "meta", "noframes", "style"
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+			case '+BASEFONT':
+			case '+BGSOUND':
+			case '+LINK':
+			case '+META':
+			case '+NOFRAMES':
+			case '+STYLE':
+				return $this->step_in_head();
+
+			/*
+			 * > An end tag whose tag name is "br"
+			 *
+			 * This should never happen, as the Tag Processor prevents showing a BR closing tag.
+			 */
+		}
+
+		/*
+		 * > A start tag whose tag name is one of: "head", "noscript"
+		 * > Any other end tag
+		 */
+		if ( '+HEAD' === $op || '+NOSCRIPT' === $op || $is_closer ) {
+			// Parse error: ignore the token.
+			return $this->step();
+		}
+
+		/*
+		 * > Anything else
+		 *
+		 * Anything here is a parse error.
+		 */
+		in_head_noscript_anything_else:
+		$this->state->stack_of_open_elements->pop();
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
 	}

 	/**
@ -1292,7 +1609,133 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_after_head(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD . ' state.' );
+		$token_name = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$is_closer  = parent::is_tag_closer();
+		$op_sigil   = '#tag' === $token_type ? ( $is_closer ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$token_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION,
+			 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
+			 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					// Insert the character.
+					$this->insert_html_element( $this->state->current_token );
+					return true;
+				}
+				goto after_head_anything_else;
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > A start tag whose tag name is "body"
+			 */
+			case '+BODY':
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->frameset_ok    = false;
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "frameset"
+			 */
+			case '+FRAMESET':
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "base", "basefont", "bgsound",
+			 * > "link", "meta", "noframes", "script", "style", "template", "title"
+			 *
+			 * Anything here is a parse error.
+			 */
+			case '+BASE':
+			case '+BASEFONT':
+			case '+BGSOUND':
+			case '+LINK':
+			case '+META':
+			case '+NOFRAMES':
+			case '+SCRIPT':
+			case '+STYLE':
+			case '+TEMPLATE':
+			case '+TITLE':
+				/*
+				 * > Push the node pointed to by the head element pointer onto the stack of open elements.
+				 * > Process the token using the rules for the "in head" insertion mode.
+				 * > Remove the node pointed to by the head element pointer from the stack of open elements. (It might not be the current node at this point.)
+				 */
+				$this->bail( 'Cannot process elements after HEAD which reopen the HEAD element.' );
+				/*
+				 * Do not leave this break in when adding support; it's here to prevent
+				 * WPCS from getting confused at the switch structure without a return,
+				 * because it doesn't know that `bail()` always throws.
+				 */
+				break;
+
+			/*
+			 * > An end tag whose tag name is "template"
+			 */
+			case '-TEMPLATE':
+				return $this->step_in_head();
+
+			/*
+			 * > An end tag whose tag name is one of: "body", "html", "br"
+			 *
+			 * Closing BR tags are always reported by the Tag Processor as opening tags.
+			 */
+			case '-BODY':
+			case '-HTML':
+				/*
+				 * > Act as described in the "anything else" entry below.
+				 */
+				goto after_head_anything_else;
+				break;
+		}
+
+		/*
+		 * > A start tag whose tag name is "head"
+		 * > Any other end tag
+		 */
+		if ( '+HEAD' === $op || $is_closer ) {
+			// Parse error: ignore the token.
+			return $this->step();
+		}
+
+		/*
+		 * > Anything else
+		 * > Insert an HTML element for a "body" start tag token with no attributes.
+		 */
+		after_head_anything_else:
+		$this->insert_virtual_node( 'BODY' );
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
 	}

 	/**
@ -4469,14 +4912,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @param string      $token_name    Name of token to create and insert into the stack of open elements.
 	 * @param string|null $bookmark_name Optional. Name to give bookmark for created virtual node.
 	 *                                   Defaults to auto-creating a bookmark name.
+	 * @return WP_HTML_Token Newly-created virtual token.
 	 */
-	private function insert_virtual_node( $token_name, $bookmark_name = null ): void {
+	private function insert_virtual_node( $token_name, $bookmark_name = null ): WP_HTML_Token {
 		$here = $this->bookmarks[ $this->state->current_token->bookmark_name ];
 		$name = $bookmark_name ?? $this->bookmark_token();

 		$this->bookmarks[ $name ] = new WP_HTML_Span( $here->start, 0 );

-		$this->insert_html_element( new WP_HTML_Token( $name, $token_name, false ) );
+		$token = new WP_HTML_Token( $name, $token_name, false );
+		$this->insert_html_element( $token );
+		return $token;
 	}

 	/*
@ -4633,6 +5079,53 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		);
 	}

+	/**
+	 * Gets an encoding from a given string.
+	 *
+	 * This is an algorithm defined in the WHAT-WG specification.
+	 *
+	 * Example:
+	 *
+	 *     'UTF-8' === self::get_encoding( 'utf8' );
+	 *     'UTF-8' === self::get_encoding( "  \tUTF-8 " );
+	 *     null    === self::get_encoding( 'UTF-7' );
+	 *     null    === self::get_encoding( 'utf8; charset=' );
+	 *
+	 * @see https://encoding.spec.whatwg.org/#concept-encoding-get
+	 *
+	 * @todo As this parser only supports UTF-8, only the UTF-8
+	 *       encodings are detected. Add more as desired, but the
+	 *       parser will bail on non-UTF-8 encodings.
+	 *
+	 * @since 6.7.0
+	 *
+	 * @param string $label A string which may specify a known encoding.
+	 * @return string|null Known encoding if matched, otherwise null.
+	 */
+	protected static function get_encoding( string $label ): ?string {
+		/*
+		 * > Remove any leading and trailing ASCII whitespace from label.
+		 */
+		$label = trim( $label, " \t\f\r\n" );
+
+		/*
+		 * > If label is an ASCII case-insensitive match for any of the labels listed in the
+		 * > table below, then return the corresponding encoding; otherwise return failure.
+		 */
+		switch ( strtolower( $label ) ) {
+			case 'unicode-1-1-utf-8':
+			case 'unicode11utf8':
+			case 'unicode20utf8':
+			case 'utf-8':
+			case 'utf8':
+			case 'x-unicode20utf8':
+				return 'UTF-8';
+
+			default:
+				return null;
+		}
+	}
+
 	/*
 	 * Constants that would pollute the top of the class if they were found there.
 	 */
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@ -16,7 +16,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '6.7-alpha-58835';
+$wp_version = '6.7-alpha-58836';

 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.