From 1ffe7a456ca939aeee7d1f3bae890c4a4482a01a Mon Sep 17 00:00:00 2001 From: dmsnell Date: Thu, 8 Aug 2024 07:25:15 +0000 Subject: [PATCH] HTML API: Add support for SVG and MathML (Foreign content) As part of work to add more spec support to the HTML API, this patch adds support for SVG and MathML elements, or more generally, "foreign content." The rules in foreign content are a mix of XML and HTML parsing rules and introduce additional complexity into the processor, but supporting them is important in order to avoid getting lost when inside these elements. Developed in https://github.com/wordpress/wordpress-develop/pull/6006 Discussed in https://core.trac.wordpress.org/ticket/61576 Props: dmsnell, jonsurrell, westonruter. See #61576. Built from https://develop.svn.wordpress.org/trunk@58867 git-svn-id: http://core.svn.wordpress.org/trunk@58263 1a063a9b-81f0-0310-95a4-ce76da25c4cd --- .../html-api/class-wp-html-open-elements.php | 113 ++-- .../class-wp-html-processor-state.php | 12 - .../html-api/class-wp-html-processor.php | 598 ++++++++++++++++-- .../html-api/class-wp-html-tag-processor.php | 464 +++++++++++++- wp-includes/html-api/class-wp-html-token.php | 19 + wp-includes/version.php | 2 +- 6 files changed, 1078 insertions(+), 130 deletions(-) diff --git a/wp-includes/html-api/class-wp-html-open-elements.php b/wp-includes/html-api/class-wp-html-open-elements.php index c760009ce0..5ce1f8feb5 100644 --- a/wp-includes/html-api/class-wp-html-open-elements.php +++ b/wp-includes/html-api/class-wp-html-open-elements.php @@ -113,13 +113,13 @@ class WP_HTML_Open_Elements { * * @param int $nth Retrieve the nth item on the stack, with 1 being * the top element, 2 being the second, etc... - * @return string|null Name of the node on the stack at the given location, - * or `null` if the location isn't on the stack. + * @return WP_HTML_Token|null Node on the stack at the given location, + * or `null` if the location isn't on the stack. 
*/ - public function at( int $nth ): ?string { + public function at( int $nth ): ?WP_HTML_Token { foreach ( $this->walk_down() as $item ) { if ( 0 === --$nth ) { - return $item->node_name; + return $item; } } @@ -242,18 +242,22 @@ class WP_HTML_Open_Elements { */ public function has_element_in_specific_scope( string $tag_name, $termination_list ): bool { foreach ( $this->walk_up() as $node ) { - if ( $node->node_name === $tag_name ) { + $namespaced_name = 'html' === $node->namespace + ? $node->node_name + : "{$node->namespace} {$node->node_name}"; + + if ( $namespaced_name === $tag_name ) { return true; } if ( '(internal: H1 through H6 - do not use)' === $tag_name && - in_array( $node->node_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) + in_array( $namespaced_name, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) ) { return true; } - if ( in_array( $node->node_name, $termination_list, true ) ) { + if ( in_array( $namespaced_name, $termination_list, true ) ) { return false; } } @@ -288,7 +292,7 @@ class WP_HTML_Open_Elements { * > - SVG title * * @since 6.4.0 - * @since 6.7.0 Supports all required HTML elements. + * @since 6.7.0 Full support. * * @see https://html.spec.whatwg.org/#has-an-element-in-scope * @@ -309,19 +313,16 @@ class WP_HTML_Open_Elements { 'OBJECT', 'TEMPLATE', - /* - * @todo Support SVG and MathML nodes when support for foreign content is added. - * - * - MathML mi - * - MathML mo - * - MathML mn - * - MathML ms - * - MathML mtext - * - MathML annotation-xml - * - SVG foreignObject - * - SVG desc - * - SVG title - */ + 'math MI', + 'math MO', + 'math MN', + 'math MS', + 'math MTEXT', + 'math ANNOTATION-XML', + + 'svg FOREIGNOBJECT', + 'svg DESC', + 'svg TITLE', ) ); } @@ -363,19 +364,16 @@ class WP_HTML_Open_Elements { 'TEMPLATE', 'UL', - /* - * @todo Support SVG and MathML nodes when support for foreign content is added. 
- * - * - MathML mi - * - MathML mo - * - MathML mn - * - MathML ms - * - MathML mtext - * - MathML annotation-xml - * - SVG foreignObject - * - SVG desc - * - SVG title - */ + 'math MI', + 'math MO', + 'math MN', + 'math MS', + 'math MTEXT', + 'math ANNOTATION-XML', + + 'svg FOREIGNOBJECT', + 'svg DESC', + 'svg TITLE', ) ); } @@ -413,19 +411,16 @@ class WP_HTML_Open_Elements { 'OBJECT', 'TEMPLATE', - /* - * @todo Support SVG and MathML nodes when support for foreign content is added. - * - * - MathML mi - * - MathML mo - * - MathML mn - * - MathML ms - * - MathML mtext - * - MathML annotation-xml - * - SVG foreignObject - * - SVG desc - * - SVG title - */ + 'math MI', + 'math MO', + 'math MN', + 'math MS', + 'math MTEXT', + 'math ANNOTATION-XML', + + 'svg FOREIGNOBJECT', + 'svg DESC', + 'svg TITLE', ) ); } @@ -692,11 +687,15 @@ class WP_HTML_Open_Elements { * @param WP_HTML_Token $item Element that was added to the stack of open elements. */ public function after_element_push( WP_HTML_Token $item ): void { + $namespaced_name = 'html' === $item->namespace + ? $item->node_name + : "{$item->namespace} {$item->node_name}"; + /* * When adding support for new elements, expand this switch to trap * cases where the precalculated value needs to change. 
*/ - switch ( $item->node_name ) { + switch ( $namespaced_name ) { case 'APPLET': case 'BUTTON': case 'CAPTION': @@ -707,6 +706,15 @@ class WP_HTML_Open_Elements { case 'MARQUEE': case 'OBJECT': case 'TEMPLATE': + case 'math MI': + case 'math MO': + case 'math MN': + case 'math MS': + case 'math MTEXT': + case 'math ANNOTATION-XML': + case 'svg FOREIGNOBJECT': + case 'svg DESC': + case 'svg TITLE': $this->has_p_in_button_scope = false; break; @@ -750,6 +758,15 @@ class WP_HTML_Open_Elements { case 'MARQUEE': case 'OBJECT': case 'TEMPLATE': + case 'math MI': + case 'math MO': + case 'math MN': + case 'math MS': + case 'math MTEXT': + case 'math ANNOTATION-XML': + case 'svg FOREIGNOBJECT': + case 'svg DESC': + case 'svg TITLE': $this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' ); break; } diff --git a/wp-includes/html-api/class-wp-html-processor-state.php b/wp-includes/html-api/class-wp-html-processor-state.php index 97f6da95a0..16875c4ac1 100644 --- a/wp-includes/html-api/class-wp-html-processor-state.php +++ b/wp-includes/html-api/class-wp-html-processor-state.php @@ -299,18 +299,6 @@ class WP_HTML_Processor_State { */ const INSERTION_MODE_AFTER_AFTER_FRAMESET = 'insertion-mode-after-after-frameset'; - /** - * In foreign content insertion mode for full HTML parser. - * - * @since 6.7.0 - * - * @see https://html.spec.whatwg.org/#parsing-main-inforeign - * @see WP_HTML_Processor_State::$insertion_mode - * - * @var string - */ - const INSERTION_MODE_IN_FOREIGN_CONTENT = 'insertion-mode-in-foreign-content'; - /** * No-quirks mode document compatability mode. 
* diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php index 39ba43e467..3820fe0277 100644 --- a/wp-includes/html-api/class-wp-html-processor.php +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -307,14 +307,14 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); - $processor->state->stack_of_open_elements->push( - new WP_HTML_Token( - 'root-node', - 'HTML', - false - ) + $root_node = new WP_HTML_Token( + 'root-node', + 'HTML', + false ); + $processor->state->stack_of_open_elements->push( $root_node ); + $context_node = new WP_HTML_Token( 'context-node', $processor->state->context_node[0], @@ -392,6 +392,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance ); + + $this->change_parsing_namespace( $token->namespace ); } ); @@ -401,6 +403,12 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance ); + $adjusted_current_node = $this->get_adjusted_current_node(); + $this->change_parsing_namespace( + $adjusted_current_node + ? $adjusted_current_node->namespace + : 'html' + ); } ); @@ -767,19 +775,20 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * foreign content will also act just like a void tag, immediately * closing as soon as the processor advances to the next token. 
* - * @since 6.6.0 + * @todo Review the self-closing logic when no node is present, ensure it + * matches the expectations in `step()`. * - * @todo When adding support for foreign content, ensure that - * this returns false for self-closing elements in the - * SVG and MathML namespace. + * @since 6.6.0 * * @param WP_HTML_Token|null $node Optional. Node to examine, if provided. * Default is to examine current node. * @return bool|null Whether to expect a closer for the currently-matched node, * or `null` if not matched on any token. */ - public function expects_closer( $node = null ): ?bool { - $token_name = $node->node_name ?? $this->get_token_name(); + public function expects_closer( WP_HTML_Token $node = null ): ?bool { + $token_name = $node->node_name ?? $this->get_token_name(); + $token_namespace = $node->namespace ?? $this->get_namespace(); + if ( ! isset( $token_name ) ) { return null; } @@ -792,7 +801,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { // Void elements. self::is_void( $token_name ) || // Special atomic elements. - in_array( $token_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) + ( 'html' === $token_namespace && in_array( $token_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) || + // Self-closing elements in foreign content. + ( isset( $node ) && 'html' !== $node->namespace && $node->has_self_closing_flag ) ); } @@ -824,14 +835,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * When moving on to the next node, therefore, if the bottom-most element * on the stack is a void element, it must be closed. - * - * @todo Once self-closing foreign elements and BGSOUND are supported, - * they must also be implicitly closed here too. BGSOUND is - * special since it's only self-closing if the self-closing flag - * is provided in the opening tag, otherwise it expects a tag closer. 
*/ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( isset( $top_node ) && ! static::expects_closer( $top_node ) ) { + if ( isset( $top_node ) && ! $this->expects_closer( $top_node ) ) { $this->state->stack_of_open_elements->pop(); } } @@ -848,14 +854,46 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { return false; } - $this->state->current_token = new WP_HTML_Token( - $this->bookmark_token(), - $this->get_token_name(), - $this->has_self_closing_flag(), - $this->release_internal_bookmark_on_destruct + $adjusted_current_node = $this->get_adjusted_current_node(); + $is_closer = $this->is_tag_closer(); + $is_start_tag = WP_HTML_Tag_Processor::STATE_MATCHED_TAG === $this->parser_state && ! $is_closer; + $token_name = $this->get_token_name(); + + if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) { + $this->state->current_token = new WP_HTML_Token( + $this->bookmark_token(), + $token_name, + $this->has_self_closing_flag(), + $this->release_internal_bookmark_on_destruct + ); + } + + $parse_in_current_insertion_mode = ( + 0 === $this->state->stack_of_open_elements->count() || + 'html' === $adjusted_current_node->namespace || + ( + 'math' === $adjusted_current_node->integration_node_type && + ( + ( $is_start_tag && ! in_array( $token_name, array( 'MGLYPH', 'MALIGNMARK' ), true ) ) || + '#text' === $token_name + ) + ) || + ( + 'math' === $adjusted_current_node->namespace && + 'ANNOTATION-XML' === $adjusted_current_node->node_name && + $is_start_tag && 'SVG' === $token_name + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + ( $is_start_tag || '#text' === $token_name ) + ) ); try { + if ( ! 
$parse_in_current_insertion_mode ) { + return $this->step_in_foreign_content(); + } + switch ( $this->state->insertion_mode ) { case WP_HTML_Processor_State::INSERTION_MODE_INITIAL: return $this->step_initial(); @@ -923,9 +961,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET: return $this->step_after_after_frameset(); - case WP_HTML_Processor_State::INSERTION_MODE_IN_FOREIGN_CONTENT: - return $this->step_in_foreign_content(); - // This should be unreachable but PHP doesn't have total type checking on switch. default: $this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." ); @@ -1853,7 +1888,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { case '+BODY': if ( 1 === $this->state->stack_of_open_elements->count() || - 'BODY' !== $this->state->stack_of_open_elements->at( 2 ) || + 'BODY' !== ( $this->state->stack_of_open_elements->at( 2 )->node_name ?? null ) || $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) { // Ignore the token. @@ -1879,7 +1914,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { case '+FRAMESET': if ( 1 === $this->state->stack_of_open_elements->count() || - 'BODY' !== $this->state->stack_of_open_elements->at( 2 ) || + 'BODY' !== ( $this->state->stack_of_open_elements->at( 2 )->node_name ?? null ) || false === $this->state->frameset_ok ) { // Ignore the token. 
@@ -2075,7 +2110,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { 'ADDRESS' !== $node->node_name && 'DIV' !== $node->node_name && 'P' !== $node->node_name && - $this->is_special( $node->node_name ) + self::is_special( $node ) ) { /* * > If node is in the special category, but is not an address, div, @@ -2136,11 +2171,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * > "button", "center", "details", "dialog", "dir", "div", "dl", "fieldset", * > "figcaption", "figure", "footer", "header", "hgroup", "listing", "main", * > "menu", "nav", "ol", "pre", "search", "section", "summary", "ul" - * - * @todo This needs to check if the element in scope is an HTML element, meaning that - * when SVG and MathML support is added, this needs to differentiate between an - * HTML element of the given name, such as `
`, and a foreign element of - * the same given name. */ case '-ADDRESS': case '-ARTICLE': @@ -2411,11 +2441,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /* * > A end tag token whose tag name is one of: "applet", "marquee", "object" - * - * @todo This needs to check if the element in scope is an HTML element, meaning that - * when SVG and MathML support is added, this needs to differentiate between an - * HTML element of the given name, such as ``, and a foreign element of - * the same given name. */ case '-APPLET': case '-MARQUEE': @@ -2679,9 +2704,12 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * These ought to be handled in the attribute methods. */ - - $this->bail( 'Cannot process MATH element, opening foreign content.' ); - break; + $this->state->current_token->namespace = 'math'; + $this->insert_html_element( $this->state->current_token ); + if ( $this->state->current_token->has_self_closing_flag ) { + $this->state->stack_of_open_elements->pop(); + } + return true; /* * > A start tag whose tag name is "svg" @@ -2695,9 +2723,12 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * These ought to be handled in the attribute methods. */ - - $this->bail( 'Cannot process SVG element, opening foreign content.' ); - break; + $this->state->current_token->namespace = 'svg'; + $this->insert_html_element( $this->state->current_token ); + if ( $this->state->current_token->has_self_closing_flag ) { + $this->state->stack_of_open_elements->pop(); + } + return true; /* * > A start tag whose tag name is one of: "caption", "col", "colgroup", @@ -2737,17 +2768,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * close anything beyond its containing `P` or `DIV` element. 
*/ foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) { - /* - * @todo This needs to check if the element in scope is an HTML element, meaning that - * when SVG and MathML support is added, this needs to differentiate between an - * HTML element of the given name, such as ``, and a foreign element of - * the same given name. - */ - if ( $token_name === $node->node_name ) { + if ( 'html' === $node->namespace && $token_name === $node->node_name ) { break; } - if ( self::is_special( $node->node_name ) ) { + if ( self::is_special( $node ) ) { // This is a parse error, ignore the token. return $this->step(); } @@ -4069,7 +4094,284 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @return bool Whether an element was found. */ private function step_in_foreign_content(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_FOREIGN_CONTENT . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + /* + * > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size" + * + * This section drawn out above the switch to more easily incorporate + * the additional rules based on the presence of the attributes. + */ + if ( + '+FONT' === $op && + ( + null !== $this->get_attribute( 'color' ) || + null !== $this->get_attribute( 'face' ) || + null !== $this->get_attribute( 'size' ) + ) + ) { + $op = '+FONT with attributes'; + } + + switch ( $op ) { + case '#text': + /* + * > A character token that is U+0000 NULL + * + * This is handled by `get_modifiable_text()`. + */ + + /* + * Whitespace-only text does not affect the frameset-ok flag. + * It is probably inter-element whitespace, but it may also + * contain character references which decode only to whitespace. 
+ */ + $text = $this->get_modifiable_text(); + if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) { + $this->state->frameset_ok = false; + } + + $this->insert_foreign_element( $this->state->current_token, false ); + return true; + + /* + * > A comment token + */ + case '#cdata-section': + case '#comment': + case '#funky_comment': + $this->insert_foreign_element( $this->state->current_token, false ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "b", "big", "blockquote", "body", "br", "center", + * > "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", + * > "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", + * > "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup", + * > "table", "tt", "u", "ul", "var" + * + * > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size" + * + * > An end tag whose tag name is "br", "p" + * + * Closing BR tags are always reported by the Tag Processor as opening tags. + */ + case '+B': + case '+BIG': + case '+BLOCKQUOTE': + case '+BODY': + case '+BR': + case '+CENTER': + case '+CODE': + case '+DD': + case '+DIV': + case '+DL': + case '+DT': + case '+EM': + case '+EMBED': + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + case '+HEAD': + case '+HR': + case '+I': + case '+IMG': + case '+LI': + case '+LISTING': + case '+MENU': + case '+META': + case '+NOBR': + case '+OL': + case '+P': + case '+PRE': + case '+RUBY': + case '+S': + case '+SMALL': + case '+SPAN': + case '+STRONG': + case '+STRIKE': + case '+SUB': + case '+SUP': + case '+TABLE': + case '+TT': + case '+U': + case '+UL': + case '+VAR': + case '+FONT with attributes': + case '-BR': + case '-P': + // @todo Indicate a parse error once it's possible. 
+ foreach ( $this->state->stack_of_open_elements->walk_up() as $current_node ) { + if ( + 'math' === $current_node->integration_node_type || + 'html' === $current_node->integration_node_type || + 'html' === $current_node->namespace + ) { + break; + } + + $this->state->stack_of_open_elements->pop(); + } + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /* + * > Any other start tag + */ + if ( ! $this->is_tag_closer() ) { + $this->insert_foreign_element( $this->state->current_token, false ); + + /* + * > If the token has its self-closing flag set, then run + * > the appropriate steps from the following list: + */ + if ( $this->state->current_token->has_self_closing_flag ) { + if ( 'SCRIPT' === $this->state->current_token->node_name && 'svg' === $this->state->current_token->namespace ) { + /* + * > Acknowledge the token's self-closing flag, and then act as + * > described in the steps for a "script" end tag below. + * + * @todo Verify that this shouldn't be handled by the rule for + * "An end tag whose name is 'script', if the current node + * is an SVG script element." + */ + goto in_foreign_content_any_other_end_tag; + } else { + $this->state->stack_of_open_elements->pop(); + } + } + return true; + } + + /* + * > An end tag whose name is "script", if the current node is an SVG script element. + */ + if ( $this->is_tag_closer() && 'SCRIPT' === $this->state->current_token->node_name && 'svg' === $this->state->current_token->namespace ) { + $this->state->stack_of_open_elements->pop(); + } + + /* + * > Any other end tag + */ + if ( $this->is_tag_closer() ) { + in_foreign_content_any_other_end_tag: + $node = $this->state->stack_of_open_elements->current_node(); + if ( $tag_name !== $node->node_name ) { + // @todo Indicate a parse error once it's possible. 
+ } + in_foreign_content_end_tag_loop: + if ( $node === $this->state->stack_of_open_elements->at( 1 ) ) { + return true; + } + + /* + * > If node's tag name, converted to ASCII lowercase, is the same as the tag name + * > of the token, pop elements from the stack of open elements until node has + * > been popped from the stack, and then return. + */ + if ( 0 === strcasecmp( $node->node_name, $tag_name ) ) { + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + return true; + } + } + } + + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + + if ( 'html' !== $node->namespace ) { + goto in_foreign_content_end_tag_loop; + } + + switch ( $this->state->insertion_mode ) { + case WP_HTML_Processor_State::INSERTION_MODE_INITIAL: + return $this->step_initial(); + + case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML: + return $this->step_before_html(); + + case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD: + return $this->step_before_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD: + return $this->step_in_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT: + return $this->step_in_head_noscript(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD: + return $this->step_after_head(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY: + return $this->step_in_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE: + return $this->step_in_table(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT: + return $this->step_in_table_text(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION: + return $this->step_in_caption(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP: + return $this->step_in_column_group(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY: + return $this->step_in_table_body(); + + 
case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW: + return $this->step_in_row(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL: + return $this->step_in_cell(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT: + return $this->step_in_select(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE: + return $this->step_in_select_in_table(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE: + return $this->step_in_template(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY: + return $this->step_after_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET: + return $this->step_in_frameset(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET: + return $this->step_after_frameset(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY: + return $this->step_after_after_body(); + + case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET: + return $this->step_after_after_frameset(); + + // This should be unreachable but PHP doesn't have total type checking on switch. + default: + $this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." ); + } + } } /* @@ -4099,6 +4401,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * HTML semantic overrides for Tag Processor */ + /** + * Indicates the namespace of the current token, or "html" if there is none. + * + * @return string One of "html", "math", or "svg". + */ + public function get_namespace(): string { + if ( ! isset( $this->current_element ) ) { + return 'html'; + } + + return $this->current_element->token->namespace; + } + /** * Returns the uppercase name of the matched tag. * @@ -4734,6 +5049,28 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { } } + /** + * Returns the adjusted current node. 
+ * + * > The adjusted current node is the context element if the parser was created as + * > part of the HTML fragment parsing algorithm and the stack of open elements + * > has only one element in it (fragment case); otherwise, the adjusted current + * > node is the current node. + * + * @see https://html.spec.whatwg.org/#adjusted-current-node + * + * @since 6.7.0 + * + * @return WP_HTML_Token|null The adjusted current node. + */ + private function get_adjusted_current_node(): ?WP_HTML_Token { + if ( isset( $this->context_node ) && 1 === $this->state->stack_of_open_elements->count() ) { + return $this->context_node; + } + + return $this->state->stack_of_open_elements->current_node(); + } + /** * Reconstructs the active formatting elements. * @@ -5043,7 +5380,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { continue; } - if ( self::is_special( $item->node_name ) ) { + if ( self::is_special( $item ) ) { $furthest_block = $item; break; } @@ -5111,6 +5448,45 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { $this->state->stack_of_open_elements->push( $token ); } + /** + * Inserts a foreign element on to the stack of open elements. + * + * @since 6.7.0 + * + * @see https://html.spec.whatwg.org/#insert-a-foreign-element + * + * @param WP_HTML_Token $token Insert this token. The token's namespace and + * insertion point will be updated correctly. + * @param bool $only_add_to_element_stack Whether to skip the "insert an element at the adjusted + * insertion location" algorithm when adding this element. + */ + private function insert_foreign_element( WP_HTML_Token $token, bool $only_add_to_element_stack ): void { + $adjusted_current_node = $this->get_adjusted_current_node(); + + $token->namespace = $adjusted_current_node ? 
$adjusted_current_node->namespace : 'html'; + + if ( $this->is_mathml_integration_point() ) { + $token->integration_node_type = 'math'; + } elseif ( $this->is_html_integration_point() ) { + $token->integration_node_type = 'html'; + } + + if ( false === $only_add_to_element_stack ) { + /* + * @todo Implement the "appropriate place for inserting a node" and the + * "insert an element at the adjusted insertion location" algorithms. + * + * These algorithms mostly impacts DOM tree construction and not the HTML API. + * Here, there's no DOM node onto which the element will be appended, so the + * parser will skip this step. + * + * @see https://html.spec.whatwg.org/#insert-an-element-at-the-adjusted-insertion-location + */ + } + + $this->insert_html_element( $token ); + } + /** * Inserts a virtual element on the stack of open elements. * @@ -5136,6 +5512,88 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * HTML Specification Helpers */ + /** + * Indicates if the current token is a MathML integration point. + * + * @since 6.7.0 + * + * @see https://html.spec.whatwg.org/#mathml-text-integration-point + * + * @return bool Whether the current token is a MathML integration point. + */ + private function is_mathml_integration_point(): bool { + $current_token = $this->state->current_token; + if ( ! isset( $current_token ) ) { + return false; + } + + if ( 'math' !== $current_token->namespace || 'M' !== $current_token->node_name[0] ) { + return false; + } + + $tag_name = $current_token->node_name; + + return ( + 'MI' === $tag_name || + 'MO' === $tag_name || + 'MN' === $tag_name || + 'MS' === $tag_name || + 'MTEXT' === $tag_name + ); + } + + /** + * Indicates if the current token is an HTML integration point. + * + * Note that this method must be an instance method with access + * to the current token, since it needs to examine the attributes + * of the currently-matched tag, if it's in the MathML namespace. 
+ * Otherwise it would be required to scan the HTML and ensure that + * no other accounting is overlooked. + * + * @since 6.7.0 + * + * @see https://html.spec.whatwg.org/#html-integration-point + * + * @return bool Whether the current token is an HTML integration point. + */ + private function is_html_integration_point(): bool { + $current_token = $this->state->current_token; + if ( ! isset( $current_token ) ) { + return false; + } + + if ( 'html' === $current_token->namespace ) { + return false; + } + + $tag_name = $current_token->node_name; + + if ( 'svg' === $current_token->namespace ) { + return ( + 'DESC' === $tag_name || + 'FOREIGNOBJECT' === $tag_name || + 'TITLE' === $tag_name + ); + } + + // Only MathML remains; anything but a MathML ANNOTATION-XML + // element (including an unrecognized namespace) is rejected. + if ( 'math' !== $current_token->namespace || 'ANNOTATION-XML' !== $tag_name ) { + return false; + } + + $encoding = $this->get_attribute( 'encoding' ); + + return ( + is_string( $encoding ) && + ( + 0 === strcasecmp( $encoding, 'application/xhtml+xml' ) || + 0 === strcasecmp( $encoding, 'text/html' ) + ) + ); + } + /** * Returns whether an element of a given name is in the HTML special category. * @@ -5143,11 +5601,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * @see https://html.spec.whatwg.org/#special * - * @param string $tag_name Name of element to check. + * @param WP_HTML_Token|string $tag_name Node to check, or only its name if in the HTML namespace. * @return bool Whether the element of the given name is in the special category. */ public static function is_special( $tag_name ): bool { - $tag_name = strtoupper( $tag_name ); + if ( is_string( $tag_name ) ) { + $tag_name = strtoupper( $tag_name ); + } else { + $tag_name = 'html' === $tag_name->namespace + ? strtoupper( $tag_name->node_name ) + : "{$tag_name->namespace} {$tag_name->node_name}"; + } return ( 'ADDRESS' === $tag_name || @@ -5235,17 +5699,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { 'XMP' === $tag_name || // MathML. 
- 'MI' === $tag_name || - 'MO' === $tag_name || - 'MN' === $tag_name || - 'MS' === $tag_name || - 'MTEXT' === $tag_name || - 'ANNOTATION-XML' === $tag_name || + 'math MI' === $tag_name || + 'math MO' === $tag_name || + 'math MN' === $tag_name || + 'math MS' === $tag_name || + 'math MTEXT' === $tag_name || + 'math ANNOTATION-XML' === $tag_name || // SVG. - 'FOREIGNOBJECT' === $tag_name || - 'DESC' === $tag_name || - 'TITLE' === $tag_name + 'svg DESC' === $tag_name || + 'svg FOREIGNOBJECT' === $tag_name || + 'svg TITLE' === $tag_name ); } diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php index 0ff2cdc4dd..fb21c15d1d 100644 --- a/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -511,6 +511,23 @@ class WP_HTML_Tag_Processor { */ protected $parser_state = self::STATE_READY; + /** + * Indicates whether the parser is inside foreign content, + * e.g. inside an SVG or MathML element. + * + * One of 'html', 'svg', or 'math'. + * + * Several parsing rules change based on whether the parser + * is inside foreign content, including whether CDATA sections + * are allowed and whether a self-closing flag indicates that + * an element has no content. + * + * @since 6.7.0 + * + * @var string + */ + private $parsing_namespace = 'html'; + /** * What kind of syntax token became an HTML comment. * @@ -780,6 +797,25 @@ class WP_HTML_Tag_Processor { $this->html = $html; } + /** + * Switches parsing mode into a new namespace, such as when + * encountering an SVG tag and entering foreign content. + * + * @since 6.7.0 + * + * @param string $new_namespace One of 'html', 'svg', or 'math' indicating into what + * namespace the next tokens will be processed. + * @return bool Whether the namespace was valid and changed. + */ + public function change_parsing_namespace( string $new_namespace ): bool { + if ( ! 
in_array( $new_namespace, array( 'html', 'math', 'svg' ), true ) ) { + return false; + } + + $this->parsing_namespace = $new_namespace; + return true; + } + /** * Finds the next tag matching the $query. * @@ -843,6 +879,7 @@ class WP_HTML_Tag_Processor { * The Tag Processor currently only supports the tag token. * * @since 6.5.0 + * @since 6.7.0 Recognizes CDATA sections within foreign content. * * @return bool Whether a token was parsed. */ @@ -956,6 +993,7 @@ class WP_HTML_Tag_Processor { */ if ( $this->is_closing_tag || + 'html' !== $this->parsing_namespace || 1 !== strspn( $this->html, 'iIlLnNpPsStTxX', $this->tag_name_starts_at, 1 ) ) { return true; @@ -996,7 +1034,6 @@ class WP_HTML_Tag_Processor { $duplicate_attributes = $this->duplicate_attributes; // Find the closing tag if necessary. - $found_closer = false; switch ( $tag_name ) { case 'SCRIPT': $found_closer = $this->skip_script_data(); @@ -1759,6 +1796,32 @@ class WP_HTML_Tag_Processor { return true; } + if ( + 'html' !== $this->parsing_namespace && + strlen( $html ) > $at + 8 && + '[' === $html[ $at + 2 ] && + 'C' === $html[ $at + 3 ] && + 'D' === $html[ $at + 4 ] && + 'A' === $html[ $at + 5 ] && + 'T' === $html[ $at + 6 ] && + 'A' === $html[ $at + 7 ] && + '[' === $html[ $at + 8 ] + ) { + $closer_at = strpos( $html, ']]>', $at + 9 ); + if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + + return false; + } + + $this->parser_state = self::STATE_CDATA_NODE; + $this->text_starts_at = $at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; + } + /* * Anything else here is an incorrectly-opened comment and transitions * to the bogus comment state - skip to the nearest >. If no closer is @@ -2653,6 +2716,17 @@ class WP_HTML_Tag_Processor { return $matches; } + /** + * Returns the namespace of the matched token. 
+ * + * @since 6.7.0 + * + * @return string One of 'html', 'math', or 'svg'. + */ + public function get_namespace(): string { + return $this->parsing_namespace; + } + /** * Returns the uppercase name of the matched tag. * @@ -2690,6 +2764,388 @@ class WP_HTML_Tag_Processor { return null; } + /** + * Returns the adjusted tag name for a given token, taking into + * account the current parsing context, whether HTML, SVG, or MathML. + * + * @since 6.7.0 + * + * @return string|null Name of current tag name. + */ + public function get_qualified_tag_name(): ?string { + $tag_name = $this->get_tag(); + if ( null === $tag_name ) { + return null; + } + + if ( 'html' === $this->get_namespace() ) { + return $tag_name; + } + + $lower_tag_name = strtolower( $tag_name ); + if ( 'math' === $this->get_namespace() ) { + return $lower_tag_name; + } + + if ( 'svg' === $this->get_namespace() ) { + switch ( $lower_tag_name ) { + case 'altglyph': + return 'altGlyph'; + + case 'altglyphdef': + return 'altGlyphDef'; + + case 'altglyphitem': + return 'altGlyphItem'; + + case 'animatecolor': + return 'animateColor'; + + case 'animatemotion': + return 'animateMotion'; + + case 'animatetransform': + return 'animateTransform'; + + case 'clippath': + return 'clipPath'; + + case 'feblend': + return 'feBlend'; + + case 'fecolormatrix': + return 'feColorMatrix'; + + case 'fecomponenttransfer': + return 'feComponentTransfer'; + + case 'fecomposite': + return 'feComposite'; + + case 'feconvolvematrix': + return 'feConvolveMatrix'; + + case 'fediffuselighting': + return 'feDiffuseLighting'; + + case 'fedisplacementmap': + return 'feDisplacementMap'; + + case 'fedistantlight': + return 'feDistantLight'; + + case 'fedropshadow': + return 'feDropShadow'; + + case 'feflood': + return 'feFlood'; + + case 'fefunca': + return 'feFuncA'; + + case 'fefuncb': + return 'feFuncB'; + + case 'fefuncg': + return 'feFuncG'; + + case 'fefuncr': + return 'feFuncR'; + + case 'fegaussianblur': + return 'feGaussianBlur'; + 
+ case 'feimage': + return 'feImage'; + + case 'femerge': + return 'feMerge'; + + case 'femergenode': + return 'feMergeNode'; + + case 'femorphology': + return 'feMorphology'; + + case 'feoffset': + return 'feOffset'; + + case 'fepointlight': + return 'fePointLight'; + + case 'fespecularlighting': + return 'feSpecularLighting'; + + case 'fespotlight': + return 'feSpotLight'; + + case 'fetile': + return 'feTile'; + + case 'feturbulence': + return 'feTurbulence'; + + case 'foreignobject': + return 'foreignObject'; + + case 'glyphref': + return 'glyphRef'; + + case 'lineargradient': + return 'linearGradient'; + + case 'radialgradient': + return 'radialGradient'; + + case 'textpath': + return 'textPath'; + + default: + return $lower_tag_name; + } + } + } + + /** + * Returns the adjusted attribute name for a given attribute, taking into + * account the current parsing context, whether HTML, SVG, or MathML. + * + * @since 6.7.0 + * + * @param string $attribute_name Which attribute to adjust. + * + * @return string|null + */ + public function get_qualified_attribute_name( $attribute_name ): ?string { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + return null; + } + + $namespace = $this->get_namespace(); + $lower_name = strtolower( $attribute_name ); + + if ( 'math' === $namespace && 'definitionurl' === $lower_name ) { + return 'definitionURL'; + } + + if ( 'svg' === $this->get_namespace() ) { + switch ( $lower_name ) { + case 'attributename': + return 'attributeName'; + + case 'attributetype': + return 'attributeType'; + + case 'basefrequency': + return 'baseFrequency'; + + case 'baseprofile': + return 'baseProfile'; + + case 'calcmode': + return 'calcMode'; + + case 'clippathunits': + return 'clipPathUnits'; + + case 'diffuseconstant': + return 'diffuseConstant'; + + case 'edgemode': + return 'edgeMode'; + + case 'filterunits': + return 'filterUnits'; + + case 'glyphref': + return 'glyphRef'; + + case 'gradienttransform': + return 'gradientTransform'; + + 
case 'gradientunits': + return 'gradientUnits'; + + case 'kernelmatrix': + return 'kernelMatrix'; + + case 'kernelunitlength': + return 'kernelUnitLength'; + + case 'keypoints': + return 'keyPoints'; + + case 'keysplines': + return 'keySplines'; + + case 'keytimes': + return 'keyTimes'; + + case 'lengthadjust': + return 'lengthAdjust'; + + case 'limitingconeangle': + return 'limitingConeAngle'; + + case 'markerheight': + return 'markerHeight'; + + case 'markerunits': + return 'markerUnits'; + + case 'markerwidth': + return 'markerWidth'; + + case 'maskcontentunits': + return 'maskContentUnits'; + + case 'maskunits': + return 'maskUnits'; + + case 'numoctaves': + return 'numOctaves'; + + case 'pathlength': + return 'pathLength'; + + case 'patterncontentunits': + return 'patternContentUnits'; + + case 'patterntransform': + return 'patternTransform'; + + case 'patternunits': + return 'patternUnits'; + + case 'pointsatx': + return 'pointsAtX'; + + case 'pointsaty': + return 'pointsAtY'; + + case 'pointsatz': + return 'pointsAtZ'; + + case 'preservealpha': + return 'preserveAlpha'; + + case 'preserveaspectratio': + return 'preserveAspectRatio'; + + case 'primitiveunits': + return 'primitiveUnits'; + + case 'refx': + return 'refX'; + + case 'refy': + return 'refY'; + + case 'repeatcount': + return 'repeatCount'; + + case 'repeatdur': + return 'repeatDur'; + + case 'requiredextensions': + return 'requiredExtensions'; + + case 'requiredfeatures': + return 'requiredFeatures'; + + case 'specularconstant': + return 'specularConstant'; + + case 'specularexponent': + return 'specularExponent'; + + case 'spreadmethod': + return 'spreadMethod'; + + case 'startoffset': + return 'startOffset'; + + case 'stddeviation': + return 'stdDeviation'; + + case 'stitchtiles': + return 'stitchTiles'; + + case 'surfacescale': + return 'surfaceScale'; + + case 'systemlanguage': + return 'systemLanguage'; + + case 'tablevalues': + return 'tableValues'; + + case 'targetx': + return 'targetX'; + + 
case 'targety': + return 'targetY'; + + case 'textlength': + return 'textLength'; + + case 'viewbox': + return 'viewBox'; + + case 'viewtarget': + return 'viewTarget'; + + case 'xchannelselector': + return 'xChannelSelector'; + + case 'ychannelselector': + return 'yChannelSelector'; + + case 'zoomandpan': + return 'zoomAndPan'; + } + } + + if ( 'html' !== $namespace ) { + switch ( $lower_name ) { + case 'xlink:actuate': + return 'xlink actuate'; + + case 'xlink:arcrole': + return 'xlink arcrole'; + + case 'xlink:href': + return 'xlink href'; + + case 'xlink:role': + return 'xlink role'; + + case 'xlink:show': + return 'xlink show'; + + case 'xlink:title': + return 'xlink title'; + + case 'xlink:type': + return 'xlink type'; + + case 'xml:lang': + return 'xml lang'; + + case 'xml:space': + return 'xml space'; + + case 'xmlns': + return 'xmlns'; + + case 'xmlns:xlink': + return 'xmlns xlink'; + } + } + + return $attribute_name; + } + /** * Indicates if the currently matched tag contains the self-closing flag. * @@ -2963,8 +3419,12 @@ class WP_HTML_Tag_Processor { * In all other contexts it's replaced by the replacement character (U+FFFD) * for security reasons (to avoid joining together strings that were safe * when separated, but not when joined). + * + * @todo Inside HTML integration points and MathML integration points, the + * text is processed according to the insertion mode, not according + * to the foreign content rules. This should strip the NULL bytes. */ - return '#text' === $tag_name + return ( '#text' === $tag_name && 'html' === $this->get_namespace() ) ? 
str_replace( "\x00", '', $decoded ) : str_replace( "\x00", "\u{FFFD}", $decoded ); } diff --git a/wp-includes/html-api/class-wp-html-token.php b/wp-includes/html-api/class-wp-html-token.php index 948fe343df..d5e51ac290 100644 --- a/wp-includes/html-api/class-wp-html-token.php +++ b/wp-includes/html-api/class-wp-html-token.php @@ -60,6 +60,24 @@ class WP_HTML_Token { */ public $has_self_closing_flag = false; + /** + * Indicates if the element is an HTML element or if it's inside foreign content. + * + * @since 6.7.0 + * + * @var string 'html', 'svg', or 'math'. + */ + public $namespace = 'html'; + + /** + * Indicates which kind of integration point the element is, if any. + * + * @since 6.7.0 + * + * @var string|null 'math', 'html', or null if not an integration point. + */ + public $integration_node_type = null; + /** * Called when token is garbage-collected or otherwise destroyed. * @@ -80,6 +98,7 @@ class WP_HTML_Token { */ public function __construct( ?string $bookmark_name, string $node_name, bool $has_self_closing_flag, ?callable $on_destroy = null ) { $this->bookmark_name = $bookmark_name; + $this->namespace = 'html'; $this->node_name = $node_name; $this->has_self_closing_flag = $has_self_closing_flag; $this->on_destroy = $on_destroy; diff --git a/wp-includes/version.php b/wp-includes/version.php index f4d8950bd7..6c144b0662 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.7-alpha-58866'; +$wp_version = '6.7-alpha-58867'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.