HTML API: Add method to create fragment at node.

HTML Fragment parsing always happens with a context node, which may impact how a fragment of HTML is parsed. HTML Fragment Processors can be instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`.

This changeset adds a public instance method called `create_fragment_at_current_node( string $html )`. It can only be called when the processor is paused at a `#tag`, with some additional constraints:

- The opening and closing tags must appear in the HTML input (no virtual tokens).
- No "self-contained" elements are allowed (`IFRAME`, `SCRIPT`, `TITLE`, etc.).

If successful, the method will return a `WP_HTML_Processor` instance whose context is inherited from the node that the method was called from.

Props jonsurrell, bernhard-reiter, gziolo.
Fixes #62357.
Built from https://develop.svn.wordpress.org/trunk@59444


git-svn-id: http://core.svn.wordpress.org/trunk@58830 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
Bernhard Reiter 2024-11-21 13:29:18 +00:00
parent 8f1dc00b4d
commit ff3fde39ee
2 changed files with 115 additions and 1 deletions

View File

@ -424,6 +424,120 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
};
}
/**
 * Creates a fragment processor at the current node.
 *
 * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be
 * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`.
 *
 * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML
 * fragment `<td />Inside TD?</td>`.
 *
 * A BODY context node will produce the following tree:
 *
 *     └─#text Inside TD?
 *
 * Notice that the `<td>` tags are completely ignored.
 *
 * Compare that with an SVG context node that produces the following tree:
 *
 *     ├─svg:td
 *     └─#text Inside TD?
 *
 * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected.
 * This is a peculiarity of parsing HTML in foreign content like SVG.
 *
 * Finally, consider the tree produced with a TABLE context node:
 *
 *     └─TBODY
 *       └─TR
 *         └─TD
 *           └─#text Inside TD?
 *
 * These examples demonstrate how important the context node may be when processing an HTML
 * fragment. Special care must be taken when processing fragments that are expected to appear
 * in specific contexts. SVG and TABLE are good examples, but there are others.
 *
 * No fragment processor is created, and `null` is returned, when:
 *
 *  - the processor is not paused at a `#tag` token;
 *  - the processor is paused at a tag closer rather than at an opening tag;
 *  - the current node is one of the HTML elements `IFRAME`, `NOEMBED`, `NOFRAMES`, `SCRIPT`,
 *    `STYLE`, `TEXTAREA`, `TITLE`, `XMP`, or `PLAINTEXT`;
 *  - `create_fragment()` itself returns `null`.
 *
 * @since 6.8.0
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
 *
 * @param string $html Input HTML fragment to process.
 * @return static|null The created processor if successful, otherwise null.
 */
public function create_fragment_at_current_node( string $html ) {
	/*
	 * The context node must be an element. A tag closer is a tag token as well,
	 * but it does not open an element into which a fragment could be parsed,
	 * so it is rejected alongside every non-tag token.
	 */
	if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) {
		return null;
	}

	/*
	 * Use the token of the element being visited for every decision below so
	 * that the checks and the resulting context node describe the same node.
	 */
	$context_token = $this->current_element->token;
	$namespace     = $context_token->namespace;

	/*
	 * Prevent creating fragments at nodes that require a special tokenizer state.
	 * This is unsupported by the HTML Processor.
	 */
	if (
		'html' === $namespace &&
		in_array( $context_token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
	) {
		return null;
	}

	$fragment_processor = static::create_fragment( $html );
	if ( null === $fragment_processor ) {
		return null;
	}

	$fragment_processor->compat_mode = $this->compat_mode;

	// Work on a detached copy of the token: it gets its own bookmark name and no destroy callback.
	$fragment_processor->context_node                = clone $context_token;
	$fragment_processor->context_node->bookmark_name = 'context-node';
	$fragment_processor->context_node->on_destroy    = null;

	// The state stores the context node as a pair of [ node name, attributes ].
	$fragment_processor->state->context_node = array( $fragment_processor->context_node->node_name, array() );

	// An empty prefix is passed so that every attribute name on the current tag is returned.
	$attribute_names = $this->get_attribute_names_with_prefix( '' );
	if ( null !== $attribute_names ) {
		foreach ( $attribute_names as $name ) {
			$fragment_processor->state->context_node[1][ $name ] = $this->get_attribute( $name );
		}
	}

	$fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name );

	if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) {
		$fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE;
	}

	$fragment_processor->reset_insertion_mode_appropriately();

	/*
	 * > Set the parser's form element pointer to the nearest node to the context element that
	 * > is a form element (going straight up the ancestor chain, and including the element
	 * > itself, if it is a form element), if any. (If there is no such form element, the
	 * > form element pointer keeps its initial value, null.)
	 */
	foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) {
		if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) {
			$fragment_processor->state->form_element                = clone $element;
			$fragment_processor->state->form_element->bookmark_name = null;
			$fragment_processor->state->form_element->on_destroy    = null;
			break;
		}
	}

	$fragment_processor->state->encoding_confidence = 'irrelevant';

	/*
	 * Update the parsing namespace near the end of the process.
	 * This is important so that any push/pop from the stack of open
	 * elements does not change the parsing namespace.
	 */
	$fragment_processor->change_parsing_namespace(
		$context_token->integration_node_type ? 'html' : $namespace
	);

	return $fragment_processor;
}
/**
* Stops the parser and terminates its execution when encountering unsupported markup.
*

View File

@ -16,7 +16,7 @@
*
* @global string $wp_version
*/
$wp_version = '6.8-alpha-59443';
$wp_version = '6.8-alpha-59444';
/**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.