HTML API: Allow subdividing text nodes by meaningful prefixes.

HTML parsing rules at times distinguish between character tokens that are all NULL bytes, all whitespace, or other content. This patch introduces a new function which may be used to classify text node sub-regions, leading to more efficient application of these parsing rules. Further, when text is classified in this way, application code may skip some rules and decoding entirely, improving performance. For example, this can be used to ease the implementation of skipping inter-element whitespace, which is usually not rendered. Developed in https://github.com/WordPress/wordpress-develop/pull/7236 Discussed in https://core.trac.wordpress.org/ticket/61974 Props dmsnell, jonsurrell. Fixes #61974. Built from https://develop.svn.wordpress.org/trunk@58970 git-svn-id: http://core.svn.wordpress.org/trunk@58366 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2024-09-02 23:21:18 +00:00 · 2024-09-02 23:21:18 +00:00 · 8d2a10b300
parent 3008472ffb
commit 8d2a10b300
3 changed files with 183 additions and 74 deletions
--- a/wp-includes/html-api/class-wp-html-processor.php
+++ b/wp-includes/html-api/class-wp-html-processor.php
@ -843,6 +843,12 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {

 		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
 			parent::next_token();
+			if (
+				WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
+				WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
+			) {
+				parent::subdivide_text_appropriately();
+			}
 		}

 		// Finish stepping when there are no more tokens in the document.
@ -1056,8 +1062,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step();
 				}
 				goto initial_anything_else;
@ -1145,8 +1150,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step();
 				}
 				goto before_html_anything_else;
@ -1227,8 +1231,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step();
 				}
 				goto before_head_anything_else;
@ -1323,16 +1326,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
 				 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
 				 */
-				$text = $this->get_modifiable_text();
-				if ( '' === $text ) {
-					/*
-					 * If the text is empty after processing HTML entities and stripping
-					 * U+0000 NULL bytes then ignore the token.
-					 */
-					return $this->step();
-				}
-
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					// Insert the character.
 					$this->insert_html_element( $this->state->current_token );
 					return true;
@ -1552,8 +1546,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_head();
 				}

@ -1654,8 +1647,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					// Insert the character.
 					$this->insert_html_element( $this->state->current_token );
 					return true;
@ -1793,8 +1785,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {

 		switch ( $op ) {
 			case '#text':
-				$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
-
 				/*
 				 * > A character token that is U+0000 NULL
 				 *
@ -1804,11 +1794,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				 * here, but if there are any other characters in the stream
 				 * the active formats should be reconstructed.
 				 */
-				if (
-					1 <= $current_token->length &&
-					"\x00" === $this->html[ $current_token->start ] &&
-					strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
-				) {
+				if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
 					// Parse error: ignore the token.
 					return $this->step();
 				}
@ -1820,8 +1806,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				 * It is probably inter-element whitespace, but it may also
 				 * contain character references which decode only to whitespace.
 				 */
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
 					$this->state->frameset_ok = false;
 				}

@ -2829,12 +2814,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 						'TR' === $current_node_name
 					)
 				) {
-					$text = $this->get_modifiable_text();
 					/*
 					 * If the text is empty after processing HTML entities and stripping
 					 * U+0000 NULL bytes then ignore the token.
 					 */
-					if ( '' === $text ) {
+					if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
 						return $this->step();
 					}

@ -2857,7 +2841,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 					 *
 					 * @see https://html.spec.whatwg.org/#parsing-main-intabletext
 					 */
-					if ( strlen( $text ) === strspn( $text, " \t\f\r\n" ) ) {
+					if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 						$this->insert_html_element( $this->state->current_token );
 						return true;
 					}
@ -3177,16 +3161,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( '' === $text ) {
-					/*
-					 * If the text is empty after processing HTML entities and stripping
-					 * U+0000 NULL bytes then ignore the token.
-					 */
-					return $this->step();
-				}
-
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					// Insert the character.
 					$this->insert_html_element( $this->state->current_token );
 					return true;
@ -3609,19 +3584,13 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * > Any other character token
 			 */
 			case '#text':
-				$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
-
 				/*
 				 * > A character token that is U+0000 NULL
 				 *
 				 * If a text node only comprises null bytes then it should be
 				 * entirely ignored and should not return to calling code.
 				 */
-				if (
-					1 <= $current_token->length &&
-					"\x00" === $this->html[ $current_token->start ] &&
-					strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
-				) {
+				if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
 					// Parse error: ignore the token.
 					return $this->step();
 				}
@ -3986,8 +3955,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * > Process the token using the rules for the "in body" insertion mode.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				goto after_body_anything_else;
@ -4072,9 +4040,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * them under HTML. This is not supported at this time.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
@ -4193,8 +4159,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * them under HTML. This is not supported at this time.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				$this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
@ -4288,8 +4253,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * > Process the token using the rules for the "in body" insertion mode.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				goto after_after_body_anything_else;
@ -4355,8 +4319,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 * them under HTML. This is not supported at this time.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				$this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
@ -4412,6 +4375,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		}

 		switch ( $op ) {
+			case '#cdata-section':
 			case '#text':
 				/*
 				 * > A character token that is U+0000 NULL
@ -4424,8 +4388,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				 * It is probably inter-element whitespace, but it may also
 				 * contain character references which decode only to whitespace.
 				 */
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
 					$this->state->frameset_ok = false;
 				}

@ -4435,7 +4398,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			/*
 			 * > A comment token
 			 */
-			case '#cdata-section':
 			case '#comment':
 			case '#funky-comment':
 			case '#presumptuous-tag':
--- a/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/wp-includes/html-api/class-wp-html-tag-processor.php
@ -541,6 +541,20 @@ class WP_HTML_Tag_Processor {
 	 */
 	protected $comment_type = null;

+	/**
+	 * What kind of text the matched text node represents, if it was subdivided.
+	 *
+	 * Holds one of the `TEXT_IS_*` class constants. The value defaults to
+	 * `self::TEXT_IS_GENERIC` and only changes when
+	 * `self::subdivide_text_appropriately()` classifies the matched text as a
+	 * sequence of NULL bytes or as whitespace.
+	 *
+	 * @see self::TEXT_IS_NULL_SEQUENCE
+	 * @see self::TEXT_IS_WHITESPACE
+	 * @see self::TEXT_IS_GENERIC
+	 * @see self::subdivide_text_appropriately
+	 *
+	 * @since 6.7.0
+	 *
+	 * @var string
+	 */
+	protected $text_node_classification = self::TEXT_IS_GENERIC;
+
 	/**
 	 * How many bytes from the original HTML document have been read and parsed.
 	 *
@ -2199,16 +2213,17 @@ class WP_HTML_Tag_Processor {
 			unset( $this->lexical_updates[ $name ] );
 		}

-		$this->token_starts_at      = null;
-		$this->token_length         = null;
-		$this->tag_name_starts_at   = null;
-		$this->tag_name_length      = null;
-		$this->text_starts_at       = 0;
-		$this->text_length          = 0;
-		$this->is_closing_tag       = null;
-		$this->attributes           = array();
-		$this->comment_type         = null;
-		$this->duplicate_attributes = null;
+		$this->token_starts_at          = null;
+		$this->token_length             = null;
+		$this->tag_name_starts_at       = null;
+		$this->tag_name_length          = null;
+		$this->text_starts_at           = 0;
+		$this->text_length              = 0;
+		$this->is_closing_tag           = null;
+		$this->attributes               = array();
+		$this->comment_type             = null;
+		$this->text_node_classification = self::TEXT_IS_GENERIC;
+		$this->duplicate_attributes     = null;
 	}

 	/**
@ -3321,6 +3336,107 @@ class WP_HTML_Tag_Processor {
 		return $this->comment_type;
 	}

+	/**
+	 * Subdivides a matched text node or CDATA text node, splitting NULL byte sequences
+	 * and decoded whitespace as distinct prefixes.
+	 *
+	 * Note that once anything that's neither a NULL byte nor decoded whitespace is
+	 * encountered, then the remainder of the text node is left intact as generic text.
+	 *
+	 *  - The HTML Processor uses this to apply distinct rules for different kinds of text.
+	 *  - Inter-element whitespace can be detected and skipped with this method.
+	 *
+	 * Text nodes aren't eagerly subdivided because there's no need to split them unless
+	 * decisions are being made on NULL byte sequences or whitespace-only text.
+	 *
+	 * Every call begins by resetting the classification to `TEXT_IS_GENERIC`, then:
+	 *
+	 *  - For a text node, a leading run of raw NULL bytes is split off and classified
+	 *    as `TEXT_IS_NULL_SEQUENCE`. Failing that, a leading run of whitespace (space,
+	 *    tab, form feed, carriage return, or line feed, given either as raw bytes or
+	 *    as character references decoding to exactly one of those) is split off and
+	 *    classified as `TEXT_IS_WHITESPACE`.
+	 *  - For a CDATA section, nothing is split. The section is only classified, and
+	 *    only when all of its bytes are NULL bytes or all are raw whitespace.
+	 *
+	 * Splitting a text node shortens the token and text lengths to cover only the
+	 * prefix and sets `bytes_already_parsed` to the end of that prefix. As the
+	 * examples below illustrate, the following call to `next_token()` is then
+	 * expected to match the remainder of the original text.
+	 *
+	 * NOTE(review): a CDATA section whose text length is zero satisfies the "all
+	 * NULL bytes" comparison (`0 === 0`) and would therefore be reported as
+	 * `TEXT_IS_NULL_SEQUENCE`. Confirm that this is intended.
+	 *
+	 * Example:
+	 *
+	 *     $processor = new WP_HTML_Tag_Processor( "\x00Apples & Oranges" );
+	 *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
+	 *     true  === $processor->subdivide_text_appropriately(); // Text is "".
+	 *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
+	 *     false === $processor->subdivide_text_appropriately();
+	 *
+	 *     $processor = new WP_HTML_Tag_Processor( "&#x0A; \r\n\tMore" );
+	 *     true  === $processor->next_token();                   // Text is "␤ ␤␉More".
+	 *     true  === $processor->subdivide_text_appropriately(); // Text is "␤ ␤␉".
+	 *     true  === $processor->next_token();                   // Text is "More".
+	 *     false === $processor->subdivide_text_appropriately();
+	 *
+	 * @since 6.7.0
+	 *
+	 * @return bool Whether the matched text was classified as a NULL byte sequence or
+	 *              as whitespace. This is `true` even when the classified prefix spans
+	 *              the entire node, and `false` when the text starts with generic
+	 *              content or when the matched token is neither text nor CDATA.
+	 */
+	public function subdivide_text_appropriately(): bool {
+		$this->text_node_classification = self::TEXT_IS_GENERIC;
+
+		if ( self::STATE_TEXT_NODE === $this->parser_state ) {
+			/*
+			 * NULL bytes are treated categorically different than numeric character
+			 * references whose number is zero. `&#x00;` is not the same as `"\x00"`.
+			 *
+			 * Only raw NULL bytes in the source document are counted here. When the
+			 * text starts with any, that run alone becomes the matched token and
+			 * the whitespace scan below is skipped for this call.
+			 */
+			$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
+			if ( $leading_nulls > 0 ) {
+				$this->token_length             = $leading_nulls;
+				$this->text_length              = $leading_nulls;
+				$this->bytes_already_parsed     = $this->token_starts_at + $leading_nulls;
+				$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
+				return true;
+			}
+
+			/*
+			 * Start a decoding loop to determine the point at which the
+			 * text subdivides. This entails raw whitespace bytes and any
+			 * character reference that decodes to the same.
+			 *
+			 * Each pass skips a run of raw whitespace and then, if a character
+			 * reference follows which decodes to exactly one whitespace byte,
+			 * steps over it and repeats. Anything else ends the scan, leaving
+			 * `$at` at the first byte that is not part of the whitespace prefix.
+			 */
+			$at  = $this->text_starts_at;
+			$end = $this->text_starts_at + $this->text_length;
+			while ( $at < $end ) {
+				$skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
+				$at     += $skipped;
+
+				if ( $at < $end && '&' === $this->html[ $at ] ) {
+					$matched_byte_length = null;
+					$replacement         = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
+					if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
+						$at += $matched_byte_length;
+						continue;
+					}
+				}
+
+				break;
+			}
+
+			if ( $at > $this->text_starts_at ) {
+				$new_length                     = $at - $this->text_starts_at;
+				$this->text_length              = $new_length;
+				$this->token_length             = $new_length;
+				$this->bytes_already_parsed     = $at;
+				$this->text_node_classification = self::TEXT_IS_WHITESPACE;
+				return true;
+			}
+
+			return false;
+		}
+
+		// Unlike text nodes, there are no character references within CDATA sections, so only raw bytes are examined and the section is classified as a whole.
+		if ( self::STATE_CDATA_NODE === $this->parser_state ) {
+			$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
+			if ( $leading_nulls === $this->text_length ) {
+				$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
+				return true;
+			}
+
+			$leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length );
+			if ( $leading_ws === $this->text_length ) {
+				$this->text_node_classification = self::TEXT_IS_WHITESPACE;
+				return true;
+			}
+		}
+
+		return false;
+	}
+
 	/**
 	 * Returns the modifiable text for a matched token, or an empty string.
 	 *
@ -4248,4 +4364,35 @@ class WP_HTML_Tag_Processor {
 	 * @since 6.5.0
 	 */
 	const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
+
+	/**
+	 * Indicates that a span of text may contain any combination of significant
+	 * kinds of characters: NULL bytes, whitespace, and others.
+	 *
+	 * This is the default classification, used whenever a span has not been
+	 * identified as a NULL byte sequence or as whitespace.
+	 *
+	 * @see self::$text_node_classification
+	 * @see self::subdivide_text_appropriately
+	 *
+	 * @since 6.7.0
+	 */
+	const TEXT_IS_GENERIC = 'TEXT_IS_GENERIC';
+
+	/**
+	 * Indicates that a span of text comprises a sequence only of NULL bytes.
+	 *
+	 * Only raw NULL bytes in the document are meant; a numeric character
+	 * reference such as `&#x00;` does not count as a NULL byte here.
+	 *
+	 * @see self::$text_node_classification
+	 * @see self::subdivide_text_appropriately
+	 *
+	 * @since 6.7.0
+	 */
+	const TEXT_IS_NULL_SEQUENCE = 'TEXT_IS_NULL_SEQUENCE';
+
+	/**
+	 * Indicates that a span of decoded text comprises only whitespace.
+	 *
+	 * Whitespace here means space, tab, form feed, carriage return, and line
+	 * feed, whether given as raw bytes or, within text nodes, as character
+	 * references which decode to one of those characters.
+	 *
+	 * @see self::$text_node_classification
+	 * @see self::subdivide_text_appropriately
+	 *
+	 * @since 6.7.0
+	 */
+	const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE';
 }
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@ -16,7 +16,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '6.7-alpha-58969';
+$wp_version = '6.7-alpha-58970';

 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.