From 01d2199622d52b08d1704871770c68e35d2a80dc Mon Sep 17 00:00:00 2001 From: dmsnell Date: Fri, 23 Aug 2024 15:42:17 +0000 Subject: [PATCH] HTML API: Add support for missing FRAMESET and "after" insertion modes. As part of work to add more spec support to the HTML API, this patch adds support for the FRAMESET-related insertion modes, as well as the set of missing after insertion modes. These modes run at the end of parsing a document, closing it and taking care of any lingering tags. Developed in https://github.com/wordpress/wordpress-develop/7165 Discussed in https://core.trac.wordpress.org/ticket/61576 Props dmsnell, jonsurrell. See #61576. Built from https://develop.svn.wordpress.org/trunk@58926 git-svn-id: http://core.svn.wordpress.org/trunk@58322 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- .../html-api/class-wp-html-processor.php | 334 +++++++++++++++++- wp-includes/version.php | 2 +- 2 files changed, 329 insertions(+), 7 deletions(-) diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php index ca7a4cf3e0..168b8ace4c 100644 --- a/wp-includes/html-api/class-wp-html-processor.php +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -3972,7 +3972,71 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_after_body(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + goto after_body_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of BODY is unsupported.' ); + break; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > An end tag whose tag name is "html" + * + * > If the parser was created as part of the HTML fragment parsing algorithm, + * > this is a parse error; ignore the token. (fragment case) + * > + * > Otherwise, switch the insertion mode to "after after body". + */ + case '-HTML': + if ( isset( $this->context_node ) ) { + return $this->step(); + } + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY; + return true; + } + + /* + * > Parse error. Switch the insertion mode to "in body" and reprocess the token. + */ + after_body_anything_else: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -3991,7 +4055,109 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_in_frameset(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Insert the character. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + $text = $this->get_modifiable_text(); + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in frameset.' ); + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is "frameset" + */ + case '+FRAMESET': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "frameset" + */ + case '-FRAMESET': + /* + * > If the current node is the root html element, then this is a parse error; + * > ignore the token. (fragment case) + */ + if ( $this->state->stack_of_open_elements->current_node_is( 'HTML' ) ) { + return $this->step(); + } + + /* + * > Otherwise, pop the current node from the stack of open elements. + */ + $this->state->stack_of_open_elements->pop(); + + /* + * > If the parser was not created as part of the HTML fragment parsing algorithm + * > (fragment case), and the current node is no longer a frameset element, then + * > switch the insertion mode to "after frameset". + */ + if ( ! isset( $this->context_node ) && ! $this->state->stack_of_open_elements->current_node_is( 'FRAMESET' ) ) { + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET; + } + + return true; + + /* + * > A start tag whose tag name is "frame" + * + * > Insert an HTML element for the token. Immediately pop the + * > current node off the stack of open elements. + * > + * > Acknowledge the token's self-closing flag, if it is set. + */ + case '+FRAME': + $this->insert_html_element( $this->state->current_token ); + $this->state->stack_of_open_elements->pop(); + return true; + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); } /** @@ -4010,7 +4176,67 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_after_frameset(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Insert the character. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in after frameset' ); + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > An end tag whose tag name is "html" + */ + case '-HTML': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET; + return true; + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); } /** @@ -4029,7 +4255,52 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_after_after_body(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of HTML is unsupported.' ); + break; + + /* + * > A DOCTYPE token + * > A start tag whose tag name is "html" + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case 'html': + case '+HTML': + return $this->step_in_body(); + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Process the token using the rules for the "in body" insertion mode. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + goto after_after_body_anything_else; + break; + } + + /* + * > Parse error. Switch the insertion mode to "in body" and reprocess the token. + */ + after_after_body_anything_else: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -4048,7 +4319,57 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_after_after_frameset(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of HTML is unsupported.' ); + break; + + /* + * > A DOCTYPE token + * > A start tag whose tag name is "html" + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case 'html': + case '+HTML': + return $this->step_in_body(); + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Process the token using the rules for the "in body" insertion mode. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' ); + break; + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); } /** @@ -4115,7 +4436,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { */ case '#cdata-section': case '#comment': - case '#funky_comment': + case '#funky-comment': + case '#presumptuous-tag': $this->insert_foreign_element( $this->state->current_token, false ); return true; diff --git a/wp-includes/version.php b/wp-includes/version.php index ef7e74496f..0338d92c6a 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.7-alpha-58925'; +$wp_version = '6.7-alpha-58926'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.