HTML API: Add set_modifiable_text() for replacing text nodes.

This patch introduces a new method, `set_modifiable_text()` to the Tag Processor, which makes it possible and safe to replace text nodes within an HTML document, performing the appropriate escaping. This method can be used in conjunction with other code to modify the text content of a document, and can be used for transforming HTML in a streaming fashion. Developed in https://github.com/wordpress/wordpress-develop/pull/7007 Discussed in https://core.trac.wordpress.org/ticket/61617 Props: dmsnell, gziolo, zieladam. Fixes #61617. Built from https://develop.svn.wordpress.org/trunk@58829 git-svn-id: http://core.svn.wordpress.org/trunk@58225 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2024-07-29 17:59:12 +00:00 · 2024-07-29 17:59:12 +00:00 · aaa6658da9
parent d8ab99f84b
commit aaa6658da9
2 changed files with 155 additions and 2 deletions
--- a/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/wp-includes/html-api/class-wp-html-tag-processor.php
@ -2889,7 +2889,9 @@ class WP_HTML_Tag_Processor {
 			return '';
 		}

-		$text = substr( $this->html, $this->text_starts_at, $this->text_length );
+		$text = isset( $this->lexical_updates['modifiable text'] )
+			? $this->lexical_updates['modifiable text']->text
+			: substr( $this->html, $this->text_starts_at, $this->text_length );

 		/*
 		 * Pre-processing the input stream would normally happen before
@ -2956,6 +2958,157 @@ class WP_HTML_Tag_Processor {
 			: str_replace( "\x00", "\u{FFFD}", $decoded );
 	}

+	/**
+	 * Sets the modifiable text for the matched token, if matched.
+	 *
+	 * Modifiable text is text content that may be read and changed without
+	 * changing the HTML structure of the document around it. This includes
+	 * the contents of `#text` nodes in the HTML as well as the inner
+	 * contents of HTML comments, Processing Instructions, and others, even
+	 * though these nodes aren't part of a parsed DOM tree. They also contain
+	 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
+	 * other section in an HTML document which cannot contain HTML markup (DATA).
+	 *
+	 * Not all modifiable text may be set by this method, and not all content
+	 * may be set as modifiable text. In the case that this fails it will return
+	 * `false` indicating as much. For instance, it will not allow inserting the
+	 * string `</script` into a SCRIPT element, because the rules for escaping
+	 * that safely are complicated. Similarly, it will not allow setting content
+	 * into a comment which would prematurely terminate the comment.
+	 *
+	 * Example:
+	 *
+	 *     // Add a preface to all STYLE contents.
+	 *     while ( $processor->next_tag( 'STYLE' ) ) {
+	 *         $style = $processor->get_modifiable_text();
+	 *         $processor->set_modifiable_text( "// Made with love on the World Wide Web\n{$style}" );
+	 *     }
+	 *
+	 *     // Replace smiley text with Emoji smilies.
+	 *     while ( $processor->next_token() ) {
+	 *         if ( '#text' !== $processor->get_token_name() ) {
+	 *             continue;
+	 *         }
+	 *
+	 *         $chunk = $processor->get_modifiable_text();
+	 *         if ( ! str_contains( $chunk, ':)' ) ) {
+	 *             continue;
+	 *         }
+	 *
+	 *         $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
+	 *     }
+	 *
+	 * @since 6.7.0
+	 *
+	 * @param string $plaintext_content New text content to represent in the matched token.
+	 *
+	 * @return bool Whether the text was able to update.
+	 */
+	public function set_modifiable_text( string $plaintext_content ): bool {
+		if ( self::STATE_TEXT_NODE === $this->parser_state ) {
+			$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
+				$this->text_starts_at,
+				$this->text_length,
+				htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 )
+			);
+
+			return true;
+		}
+
+		// Comment data is not encoded.
+		if (
+			self::STATE_COMMENT === $this->parser_state &&
+			self::COMMENT_AS_HTML_COMMENT === $this->comment_type
+		) {
+			// Check if the text could close the comment.
+			if ( 1 === preg_match( '/--!?>/', $plaintext_content ) ) {
+				return false;
+			}
+
+			$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
+				$this->text_starts_at,
+				$this->text_length,
+				$plaintext_content
+			);
+
+			return true;
+		}
+
+		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+			return false;
+		}
+
+		switch ( $this->get_tag() ) {
+			case 'SCRIPT':
+				/*
+				 * This is over-protective, but ensures the update doesn't break
+				 * out of the SCRIPT element. A more thorough check would need to
+				 * ensure that the script closing tag doesn't exist, and isn't
+				 * also "hidden" inside the script double-escaped state.
+				 *
+				 * It may seem like replacing `</script` with `<\/script` would
+				 * properly escape these things, but this could mask regex patterns
+				 * that previously worked. Resolve this by not sending `</script`
+				 */
+				if ( false !== stripos( $plaintext_content, '</script' ) ) {
+					return false;
+				}
+
+				$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
+					$this->text_starts_at,
+					$this->text_length,
+					$plaintext_content
+				);
+
+				return true;
+
+			case 'STYLE':
+				$plaintext_content = preg_replace_callback(
+					'~</(?P<TAG_NAME>style)~i',
+					static function ( $tag_match ) {
+						return "\\3c\\2f{$tag_match['TAG_NAME']}";
+					},
+					$plaintext_content
+				);
+
+				$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
+					$this->text_starts_at,
+					$this->text_length,
+					$plaintext_content
+				);
+
+				return true;
+
+			case 'TEXTAREA':
+			case 'TITLE':
+				$plaintext_content = preg_replace_callback(
+					"~</(?P<TAG_NAME>{$this->get_tag()})~i",
+					static function ( $tag_match ) {
+						return "&lt;/{$tag_match['TAG_NAME']}";
+					},
+					$plaintext_content
+				);
+
+				/*
+				 * These don't _need_ to be escaped, but since they are decoded it's
+				 * safe to leave them escaped and this can prevent other code from
+				 * naively detecting tags within the contents.
+				 *
+				 * @todo It would be useful to prefix a multiline replacement text
+				 *       with a newline, but not necessary. This is for aesthetics.
+				 */
+				$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
+					$this->text_starts_at,
+					$this->text_length,
+					$plaintext_content
+				);
+
+				return true;
+		}
+
+		return false;
+	}
+
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@ -16,7 +16,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '6.7-alpha-58828';
+$wp_version = '6.7-alpha-58829';

 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.