HTML API: Add missing subclass methods to HTML Processor and add token provenance.

This patch introduces two related changes:

 - It adds missing subclass methods on the HTML Processor which needed
   to be implemented since it started visiting virtual nodes. These
   methods need to account for the fact that not all tokens truly exist.

 - It adds a new concept and internal method, `is_virtual()`, indicating
   if the currently-matched token comes from the raw text in the input
   HTML document or if it was the byproduct of semantic parsing rules.
   This internal method and new vocabulary around token provenance
   considerably simplify the logic spread throughout the rest of the
   class and its subclass methods.

Developed in https://github.com/WordPress/wordpress-develop/pull/6860
Discussed in https://core.trac.wordpress.org/ticket/61348

Follow-up to [58304].

Props dmsnell, jonsurrell, gziolo.
See #61348.

Built from https://develop.svn.wordpress.org/trunk@58558


git-svn-id: http://core.svn.wordpress.org/trunk@58006 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
dmsnell 2024-06-25 03:11:19 +00:00
parent e5d85a4490
commit a636cd1c42
3 changed files with 187 additions and 56 deletions

View File

@ -349,13 +349,19 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
$this->state->stack_of_open_elements->set_push_handler( $this->state->stack_of_open_elements->set_push_handler(
function ( WP_HTML_Token $token ) { function ( WP_HTML_Token $token ) {
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH ); $is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer();
$same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
$provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance );
} }
); );
$this->state->stack_of_open_elements->set_pop_handler( $this->state->stack_of_open_elements->set_pop_handler(
function ( WP_HTML_Token $token ) { function ( WP_HTML_Token $token ) {
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP ); $is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer();
$same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
$provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance );
} }
); );
@ -569,11 +575,26 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether the current tag is a tag closer. * @return bool Whether the current tag is a tag closer.
*/ */
public function is_tag_closer() { public function is_tag_closer() {
return isset( $this->current_element ) return $this->is_virtual()
? ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) ? ( WP_HTML_Stack_Event::POP === $this->current_element->operation && '#tag' === $this->get_token_type() )
: parent::is_tag_closer(); : parent::is_tag_closer();
} }
/**
* Indicates if the currently-matched token is virtual, created by a stack operation
* while processing HTML, rather than a token found in the HTML text itself.
*
* @since 6.6.0
*
* @return bool Whether the current token is virtual.
*/
private function is_virtual() {
return (
isset( $this->current_element->provenance ) &&
'virtual' === $this->current_element->provenance
);
}
/** /**
* Indicates if the currently-matched tag matches the given breadcrumbs. * Indicates if the currently-matched tag matches the given breadcrumbs.
* *
@ -1440,7 +1461,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return null; return null;
} }
if ( isset( $this->current_element ) ) { if ( $this->is_virtual() ) {
return $this->current_element->token->node_name; return $this->current_element->token->node_name;
} }
@ -1459,6 +1480,27 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
} }
} }
/**
* Indicates if the currently matched tag contains the self-closing flag.
*
* No HTML elements ought to have the self-closing flag and for those, the self-closing
* flag will be ignored. For void elements this is benign because they "self close"
* automatically. For non-void HTML elements though problems will appear if someone
* intends to use a self-closing element in place of that element with an empty body.
* For HTML foreign elements and custom elements the self-closing flag determines if
* they self-close or not.
*
* This function does not determine if a tag is self-closing,
* but only if the self-closing flag is present in the syntax.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return bool Whether the currently matched tag contains the self-closing flag.
*/
public function has_self_closing_flag() {
return $this->is_virtual() ? false : parent::has_self_closing_flag();
}
/** /**
* Returns the node name represented by the token. * Returns the node name represented by the token.
* *
@ -1480,11 +1522,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return string|null Name of the matched token. * @return string|null Name of the matched token.
*/ */
public function get_token_name() { public function get_token_name() {
if ( isset( $this->current_element ) ) { return $this->is_virtual()
return $this->current_element->token->node_name; ? $this->current_element->token->node_name
} : parent::get_token_name();
return parent::get_token_name();
} }
/** /**
@ -1510,9 +1550,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return string|null What kind of token is matched, or null. * @return string|null What kind of token is matched, or null.
*/ */
public function get_token_type() { public function get_token_type() {
if ( isset( $this->current_element ) ) { if ( $this->is_virtual() ) {
/*
* This logic comes from the Tag Processor.
*
* @todo It would be ideal not to repeat this here, but it's not clearly
* better to allow passing a token name to `get_token_type()`.
*/
$node_name = $this->current_element->token->node_name; $node_name = $this->current_element->token->node_name;
if ( ctype_upper( $node_name[0] ) ) { $starting_char = $node_name[0];
if ( 'A' <= $starting_char && 'Z' >= $starting_char ) {
return '#tag'; return '#tag';
} }
@ -1546,25 +1593,38 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`. * @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
*/ */
public function get_attribute( $name ) { public function get_attribute( $name ) {
if ( isset( $this->current_element ) ) { return $this->is_virtual() ? null : parent::get_attribute( $name );
// Closing tokens cannot contain attributes.
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
return null;
} }
$node_name = $this->current_element->token->node_name; /**
* Updates or creates a new attribute on the currently matched tag with the passed value.
// Only tags can contain attributes. *
if ( 'A' > $node_name[0] || 'Z' < $node_name[0] ) { * For boolean attributes special handling is provided:
return null; * - When `true` is passed as the value, then only the attribute name is added to the tag.
* - When `false` is passed, the attribute gets removed if it existed before.
*
* For string attributes, the value is escaped using the `esc_attr` function.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $name The attribute name to target.
* @param string|bool $value The new attribute value.
* @return bool Whether an attribute value was set.
*/
public function set_attribute( $name, $value ) {
return $this->is_virtual() ? false : parent::set_attribute( $name, $value );
} }
if ( $this->current_element->token->bookmark_name === (string) $this->bookmark_counter ) { /**
return parent::get_attribute( $name ); * Remove an attribute from the currently-matched tag.
} *
} * @since 6.6.0 Subclassed for HTML Processor.
*
return null; * @param string $name The attribute name to remove.
* @return bool Whether an attribute was removed.
*/
public function remove_attribute( $name ) {
return $this->is_virtual() ? false : parent::remove_attribute( $name );
} }
/** /**
@ -1594,18 +1654,63 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return array|null List of attribute names, or `null` when no tag opener is matched. * @return array|null List of attribute names, or `null` when no tag opener is matched.
*/ */
public function get_attribute_names_with_prefix( $prefix ) { public function get_attribute_names_with_prefix( $prefix ) {
if ( isset( $this->current_element ) ) { return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix );
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
return null;
} }
$mark = $this->bookmarks[ $this->current_element->token->bookmark_name ]; /**
if ( 0 === $mark->length ) { * Adds a new class name to the currently matched tag.
return null; *
} * @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to add.
* @return bool Whether the class was set to be added.
*/
public function add_class( $class_name ) {
return $this->is_virtual() ? false : parent::add_class( $class_name );
} }
return parent::get_attribute_names_with_prefix( $prefix ); /**
* Removes a class name from the currently matched tag.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to remove.
* @return bool Whether the class was set to be removed.
*/
public function remove_class( $class_name ) {
return $this->is_virtual() ? false : parent::remove_class( $class_name );
}
/**
* Returns if a matched tag contains the given ASCII case-insensitive class name.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
* @return bool|null Whether the matched tag contains the given class name, or null if not matched.
*/
public function has_class( $wanted_class ) {
return $this->is_virtual() ? null : parent::has_class( $wanted_class );
}
/**
* Generator for a foreach loop to step through each class name for the matched tag.
*
* This generator function is designed to be used inside a "foreach" loop.
*
* Example:
*
* $p = WP_HTML_Processor::create_fragment( "<div class='free &lt;egg&gt;\tlang-en'>" );
* $p->next_tag();
* foreach ( $p->class_list() as $class_name ) {
* echo "{$class_name} ";
* }
* // Outputs: "free <egg> lang-en "
*
* @since 6.6.0 Subclassed for the HTML Processor.
*/
public function class_list() {
return $this->is_virtual() ? null : parent::class_list();
} }
/** /**
@ -1629,17 +1734,30 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return string * @return string
*/ */
public function get_modifiable_text() { public function get_modifiable_text() {
if ( isset( $this->current_element ) ) { return $this->is_virtual() ? '' : parent::get_modifiable_text();
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
return '';
} }
$mark = $this->bookmarks[ $this->current_element->token->bookmark_name ]; /**
if ( 0 === $mark->length ) { * Indicates what kind of comment produced the comment node.
return ''; *
} * Because there are different kinds of HTML syntax which produce
} * comments, the Tag Processor tracks and exposes this as a type
return parent::get_modifiable_text(); * for the comment. Nominally only regular HTML comments exist as
* they are commonly known, but a number of unrelated syntax errors
* also produce comments.
*
* @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
* @see self::COMMENT_AS_CDATA_LOOKALIKE
* @see self::COMMENT_AS_INVALID_HTML
* @see self::COMMENT_AS_HTML_COMMENT
* @see self::COMMENT_AS_PI_NODE_LOOKALIKE
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null
*/
public function get_comment_type() {
return $this->is_virtual() ? null : parent::get_comment_type();
} }
/** /**

View File

@ -56,14 +56,27 @@ class WP_HTML_Stack_Event {
*/ */
public $operation; public $operation;
/**
* Indicates if the stack element is a real or virtual node.
*
* @since 6.6.0
*
* @var string
*/
public $provenance;
/** /**
* Constructor function. * Constructor function.
* *
* @since 6.6.0
*
* @param WP_HTML_Token $token Token associated with stack event, always an opening token. * @param WP_HTML_Token $token Token associated with stack event, always an opening token.
* @param string $operation One of self::PUSH or self::POP. * @param string $operation One of self::PUSH or self::POP.
* @param string $provenance "virtual" or "real".
*/ */
public function __construct( $token, $operation ) { public function __construct( $token, $operation, $provenance ) {
$this->token = $token; $this->token = $token;
$this->operation = $operation; $this->operation = $operation;
$this->provenance = $provenance;
} }
} }

View File

@ -16,7 +16,7 @@
* *
* @global string $wp_version * @global string $wp_version
*/ */
$wp_version = '6.6-beta4-58557'; $wp_version = '6.6-beta4-58558';
/** /**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema. * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.