HTML API: Defer applying attribute updates until necessary.

When making repeated updates to a document, the Tag Processor will end
up copying the entire document once for every update. This can lead to
catastrophic behavior in the worst case.

However, when batch-applying updates it's able to copy chunks of the
document in one thread and only end up copying the entire document once
for the entire batch.

Previously the Tag Processor has been eagerly applying updates, but in
this patch it defers applying those updates as long as is possible.
Developed in https://github.com/WordPress/wordpress-develop/pull/6120
Discussed in https://core.trac.wordpress.org/ticket/60697

Follow-up to [55706], [56941], [57348].

Reviewed by swissspidy.
Merges [57805] to the 6.5 branch.

Props dmsnell, bernhard-reiter, jonsurrell, westonruter.
Fixes #60697.




Built from https://develop.svn.wordpress.org/branches/6.5@57815


git-svn-id: http://core.svn.wordpress.org/branches/6.5@57316 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
audrasjb 2024-03-12 14:27:14 +00:00
parent 5095c174d4
commit b1cb307a40
2 changed files with 62 additions and 12 deletions

View File

@ -837,8 +837,27 @@ class WP_HTML_Tag_Processor {
* @return bool Whether a token was parsed. * @return bool Whether a token was parsed.
*/ */
public function next_token() { public function next_token() {
return $this->base_class_next_token();
}
/**
* Internal method which finds the next token in the HTML document.
*
* This method is a private internal function which implements the logic for
* finding the next token in a document. It exists so that the parser can update
* its state without affecting the location of the cursor in the document and
* without triggering subclass methods for things like `next_token()`, e.g. when
* applying patches before searching for the next token.
*
* @since 6.5.0
*
* @access private
*
* @return bool Whether a token was parsed.
*/
private function base_class_next_token() {
$was_at = $this->bytes_already_parsed; $was_at = $this->bytes_already_parsed;
$this->get_updated_html(); $this->after_tag();
// Don't proceed if there's nothing more to scan. // Don't proceed if there's nothing more to scan.
if ( if (
@ -2041,6 +2060,45 @@ class WP_HTML_Tag_Processor {
* @since 6.2.0 * @since 6.2.0
*/ */
private function after_tag() { private function after_tag() {
/*
* There could be lexical updates enqueued for an attribute that
* also exists on the next tag. In order to avoid conflating the
* attributes across the two tags, lexical updates with names
* need to be flushed to raw lexical updates.
*/
$this->class_name_updates_to_attributes_updates();
/*
* Purge updates if there are too many. The actual count isn't
* scientific, but a few values from 100 to a few thousand were
* tested to find a practically useful limit.
*
* If the update queue grows too big, then the Tag Processor
* will spend more time iterating through them and lose the
* efficiency gains of deferring applying them.
*/
if ( 1000 < count( $this->lexical_updates ) ) {
$this->get_updated_html();
}
foreach ( $this->lexical_updates as $name => $update ) {
/*
* Any updates appearing after the cursor should be applied
* before proceeding, otherwise they may be overlooked.
*/
if ( $update->start >= $this->bytes_already_parsed ) {
$this->get_updated_html();
break;
}
if ( is_int( $name ) ) {
continue;
}
$this->lexical_updates[] = $update;
unset( $this->lexical_updates[ $name ] );
}
$this->token_starts_at = null; $this->token_starts_at = null;
$this->token_length = null; $this->token_length = null;
$this->tag_name_starts_at = null; $this->tag_name_starts_at = null;
@ -2230,7 +2288,7 @@ class WP_HTML_Tag_Processor {
$shift = strlen( $diff->text ) - $diff->length; $shift = strlen( $diff->text ) - $diff->length;
// Adjust the cursor position by however much an update affects it. // Adjust the cursor position by however much an update affects it.
if ( $diff->start <= $this->bytes_already_parsed ) { if ( $diff->start < $this->bytes_already_parsed ) {
$this->bytes_already_parsed += $shift; $this->bytes_already_parsed += $shift;
} }
@ -3164,15 +3222,7 @@ class WP_HTML_Tag_Processor {
* └←─┘ back up by strlen("em") + 1 ==> 3 * └←─┘ back up by strlen("em") + 1 ==> 3
*/ */
$this->bytes_already_parsed = $before_current_tag; $this->bytes_already_parsed = $before_current_tag;
$this->parse_next_tag(); $this->base_class_next_token();
// Reparse the attributes.
while ( $this->parse_next_attribute() ) {
continue;
}
$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
$this->token_length = $tag_ends_at - $this->token_starts_at;
$this->bytes_already_parsed = $tag_ends_at;
return $this->html; return $this->html;
} }

View File

@ -16,7 +16,7 @@
* *
* @global string $wp_version * @global string $wp_version
*/ */
$wp_version = '6.5-RC1-57813'; $wp_version = '6.5-RC1-57815';
/** /**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema. * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.