HTML API: Add context to Unsupported_Exception class for improved debugging.

The HTML Processor internally throws an exception when it reaches HTML
that it knows it cannot process, but this exception is not made
available to calling code. It can be useful to extract more knowledge
about why it gave up, especially for debugging purposes.

In this patch, more context is added to the WP_HTML_Unsupported_Exception
and the last exception is made available to calling code through a new
method, `get_unsupported_exception()`.

Developed in https://github.com/WordPress/wordpress-develop/pull/6985
Discussed in https://core.trac.wordpress.org/ticket/61646

Props bernhard-reiter, dmsnell, jonsurrell.
See #61646.

Built from https://develop.svn.wordpress.org/trunk@58714


git-svn-id: http://core.svn.wordpress.org/trunk@58116 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
dmsnell 2024-07-12 22:29:13 +00:00
parent 519a8f6bbd
commit 4f85cc258c
3 changed files with 180 additions and 57 deletions

View File

@ -188,6 +188,17 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/ */
private $last_error = null; private $last_error = null;
/**
 * Stores context for why the parser bailed on unsupported HTML, if it did.
 *
 * Starts out as `null` and is assigned by `bail()` immediately before that
 * method throws, so calling code can inspect the failure afterwards.
 *
 * @since 6.7.0
 *
 * @see self::get_unsupported_exception()
 *
 * @var WP_HTML_Unsupported_Exception|null
 */
private $unsupported_exception = null;
/** /**
* Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance.
* *
@ -384,6 +395,45 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
}; };
} }
/**
 * Halts the parser on unsupported markup, recording diagnostic context first.
 *
 * Gathers the raw text and starting byte offset of the current token along
 * with the node names found on the stack of open elements and in the list of
 * active formatting elements. It then flags the processor with
 * `ERROR_UNSUPPORTED`, keeps the newly-built exception so that it can be
 * read back later, and throws that exception.
 *
 * @since 6.7.0
 *
 * @see self::get_unsupported_exception()
 *
 * @throws WP_HTML_Unsupported_Exception Always; halts execution of the parser.
 *
 * @param string $message Explains support is missing in order to parse the current node.
 *
 * @return mixed Nothing is ever returned: every call ends by throwing.
 */
private function bail( string $message ) {
	$current_token = $this->state->current_token;
	$bookmark      = $this->bookmarks[ $current_token->bookmark_name ];
	$raw_token     = substr( $this->html, $bookmark->start, $bookmark->length );

	// Snapshot the names of every node currently on the stack of open elements.
	$open_element_names = array();
	foreach ( $this->state->stack_of_open_elements->stack as $open_element ) {
		$open_element_names[] = $open_element->node_name;
	}

	// Snapshot the names of the active formatting elements, in walk-down order.
	$active_format_names = array();
	foreach ( $this->state->active_formatting_elements->walk_down() as $formatting_element ) {
		$active_format_names[] = $formatting_element->node_name;
	}

	$this->last_error = self::ERROR_UNSUPPORTED;

	$exception = new WP_HTML_Unsupported_Exception(
		$message,
		$current_token->node_name,
		$bookmark->start,
		$raw_token,
		$open_element_names,
		$active_format_names
	);

	// Keep a reference so the failure can be inspected after it is caught.
	$this->unsupported_exception = $exception;

	throw $exception;
}
/** /**
* Returns the last error, if any. * Returns the last error, if any.
* *
@ -411,6 +461,21 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return $this->last_error; return $this->last_error;
} }
/**
 * Returns context for why the parser aborted due to unsupported HTML, if it did.
 *
 * The returned exception is the one stored by `bail()` just before it was
 * thrown. It carries the message plus the token and parser-state details
 * captured at that moment. When no such exception has been recorded the
 * stored value is still `null`, and `null` is returned.
 *
 * This is meant for debugging purposes, not for production use.
 *
 * @since 6.7.0
 *
 * @see self::$unsupported_exception
 *
 * @return WP_HTML_Unsupported_Exception|null The recorded exception, or `null` if there is none.
 */
public function get_unsupported_exception() {
return $this->unsupported_exception;
}
/** /**
* Finds the next tag matching the $query. * Finds the next tag matching the $query.
* *
@ -841,8 +906,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
// This should be unreachable but PHP doesn't have total type checking on switch. // This should be unreachable but PHP doesn't have total type checking on switch.
default: default:
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." );
throw new WP_HTML_Unsupported_Exception( "Found unrecognized insertion mode '{$this->state->insertion_mode}'." );
} }
} catch ( WP_HTML_Unsupported_Exception $e ) { } catch ( WP_HTML_Unsupported_Exception $e ) {
/* /*
@ -922,8 +986,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_initial() { private function step_initial() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -942,8 +1005,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_before_html() { private function step_before_html() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -962,8 +1024,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_before_head() { private function step_before_head() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -982,8 +1043,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_head() { private function step_in_head() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1002,8 +1062,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_head_noscript() { private function step_in_head_noscript() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1022,8 +1081,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_after_head() { private function step_after_head() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1445,8 +1503,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > than the end tag token that it actually is. * > than the end tag token that it actually is.
*/ */
case '-BR': case '-BR':
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( 'Closing BR tags require unimplemented special handling.' );
throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' ); // This return required because PHPCS can't determine that the call to bail() throws.
return false;
/* /*
* > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr" * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
@ -1602,8 +1661,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
case 'TITLE': case 'TITLE':
case 'TR': case 'TR':
case 'XMP': case 'XMP':
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "Cannot process {$token_name} element." );
throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
} }
if ( ! parent::is_tag_closer() ) { if ( ! parent::is_tag_closer() ) {
@ -1665,8 +1723,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_table() { private function step_in_table() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1685,8 +1742,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_table_text() { private function step_in_table_text() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1705,8 +1761,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_caption() { private function step_in_caption() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1725,8 +1780,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_column_group() { private function step_in_column_group() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1745,8 +1799,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_table_body() { private function step_in_table_body() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1765,8 +1818,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_row() { private function step_in_row() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1785,8 +1837,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_cell() { private function step_in_cell() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -1986,8 +2037,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_select_in_table() { private function step_in_select_in_table() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -2006,8 +2056,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_template() { private function step_in_template() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -2026,8 +2075,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_after_body() { private function step_after_body() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -2046,8 +2094,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_frameset() { private function step_in_frameset() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -2066,8 +2113,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_after_frameset() { private function step_after_frameset() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -2086,8 +2132,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_after_after_body() { private function step_after_after_body() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -2106,8 +2151,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_after_after_frameset() { private function step_after_after_frameset() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/** /**
@ -2126,8 +2170,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* @return bool Whether an element was found. * @return bool Whether an element was found.
*/ */
private function step_in_foreign_content() { private function step_in_foreign_content() {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( "No support for parsing in the '{$this->state->insertion_mode}' state." );
throw new WP_HTML_Unsupported_Exception( "No support for parsing in the '{$this->state->insertion_mode}' state." );
} }
/* /*
@ -2835,8 +2878,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
return false; return false;
} }
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' );
throw new WP_HTML_Unsupported_Exception( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' );
} }
/** /**
@ -3072,8 +3114,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
// > If there is no such element, then return and instead act as described in the "any other end tag" entry above. // > If there is no such element, then return and instead act as described in the "any other end tag" entry above.
if ( null === $formatting_element ) { if ( null === $formatting_element ) {
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( 'Cannot run adoption agency when "any other end tag" is required.' );
throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when "any other end tag" is required.' );
} }
// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return. // > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
@ -3125,12 +3166,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
} }
} }
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( 'Cannot extract common ancestor in adoption agency algorithm.' );
throw new WP_HTML_Unsupported_Exception( 'Cannot extract common ancestor in adoption agency algorithm.' );
} }
$this->last_error = self::ERROR_UNSUPPORTED; $this->bail( 'Cannot run adoption agency when looping required.' );
throw new WP_HTML_Unsupported_Exception( 'Cannot run adoption agency when looping required.' );
} }
/** /**

View File

@ -21,11 +21,95 @@
* operation and signify that the given HTML cannot be processed. * operation and signify that the given HTML cannot be processed.
* *
* @since 6.4.0 * @since 6.4.0
* @since 6.7.0 Gained contextual information for use in debugging parse failures.
* *
* @access private * @access private
* *
* @see WP_HTML_Processor * @see WP_HTML_Processor
*/ */
class WP_HTML_Unsupported_Exception extends Exception { class WP_HTML_Unsupported_Exception extends Exception {
/**
 * Name of the matched token when the exception was raised,
 * if matched on a token.
 *
 * This does not imply that the token itself was unsupported, but it
 * may have been the case that the token triggered part of the HTML
 * parsing that isn't supported, such as the adoption agency algorithm.
 *
 * Set once by the constructor from its `$token_name` argument.
 *
 * @since 6.7.0
 *
 * @var string
 */
public $token_name;
/**
 * Number of bytes into the input HTML document where the matched token
 * starts, i.e. where the parser was parsing when the exception was raised.
 *
 * Use this to reconstruct context for the failure.
 *
 * Set once by the constructor from its `$token_at` argument.
 *
 * @since 6.7.0
 *
 * @var int
 */
public $token_at;
/**
 * Full raw text of the matched token when the exception was raised,
 * if matched on a token.
 *
 * Whereas the `$token_name` will be normalized, this contains the full
 * raw text of the token, including original casing, duplicated attributes,
 * and other syntactic variations that are normally abstracted in the HTML API.
 *
 * Set once by the constructor from its `$token` argument.
 *
 * @since 6.7.0
 *
 * @var string
 */
public $token;
/**
 * Stack of open elements when the exception was raised.
 *
 * Use this to trace the parsing circumstances which led to the exception.
 *
 * Defaults to an empty array and is overwritten by the constructor.
 *
 * @since 6.7.0
 *
 * @var string[]
 */
public $stack_of_open_elements = array();
/**
 * List of active formatting elements when the exception was raised.
 *
 * Use this to trace the parsing circumstances which led to the exception.
 *
 * Defaults to an empty array and is overwritten by the constructor.
 *
 * @since 6.7.0
 *
 * @var string[]
 */
public $active_formatting_elements = array();
/**
 * Builds the exception and attaches the parsing context to it.
 *
 * Only the message is handed to the parent `Exception` constructor; every
 * other argument is copied verbatim onto the public property of the same
 * name so that it can be inspected when debugging a parse failure.
 *
 * @since 6.7.0
 *
 * @param string   $message                    Brief message explaining what is unsupported, the reason this exception was raised.
 * @param string   $token_name                 Normalized name of matched token when this exception was raised.
 * @param int      $token_at                   Number of bytes into source HTML document where matched token starts.
 * @param string   $token                      Full raw text of matched token when this exception was raised.
 * @param string[] $stack_of_open_elements     Stack of open elements when this exception was raised.
 * @param string[] $active_formatting_elements List of active formatting elements when this exception was raised.
 */
public function __construct(
	string $message,
	string $token_name,
	int $token_at,
	string $token,
	array $stack_of_open_elements,
	array $active_formatting_elements
) {
	parent::__construct( $message );

	// Details about the token being processed when parsing stopped.
	$this->token_name = $token_name;
	$this->token_at   = $token_at;
	$this->token      = $token;

	// Snapshots of the parser state at that moment.
	$this->stack_of_open_elements     = $stack_of_open_elements;
	$this->active_formatting_elements = $active_formatting_elements;
}
} }

View File

@ -16,7 +16,7 @@
* *
* @global string $wp_version * @global string $wp_version
*/ */
$wp_version = '6.7-alpha-58713'; $wp_version = '6.7-alpha-58714';
/** /**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema. * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.