HTML API: Optimize low-level parsing details in Tag Processor.

Introduces a number of micro-level optimizations in the Tag Processor to improve token-scanning performance. Should contain no functional changes. Based on benchmarking against a list of the 100 most-visited websites, these changes improve the Tag Processor's tag-scanning performance by between 3.5% and 7.5% on average. Developed in https://github.com/WordPress/wordpress-develop/pull/6890 Discussed in https://core.trac.wordpress.org/ticket/61545 Follow-up to [55203]. See #61545. Built from https://develop.svn.wordpress.org/trunk@58613 git-svn-id: http://core.svn.wordpress.org/trunk@58046 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2024-07-01 23:36:15 +00:00 · 2024-07-01 23:36:15 +00:00 · df598e1d98
parent 8a4deae8f8
commit df598e1d98
3 changed files with 53 additions and 93 deletions
--- a/wp-includes/html-api/class-wp-html-decoder.php
+++ b/wp-includes/html-api/class-wp-html-decoder.php
@ -141,7 +141,7 @@ class WP_HTML_Decoder {
 		while ( $at < $end ) {
 			$next_character_reference_at = strpos( $text, '&', $at );
-			if ( false === $next_character_reference_at || $next_character_reference_at >= $end ) {
+			if ( false === $next_character_reference_at ) {
 				break;
 			}
@ -436,26 +436,26 @@ class WP_HTML_Decoder {
 		}
 		if ( $code_point <= 0x7FF ) {
-			$byte1 = ( $code_point >> 6 ) | 0xC0;
+			$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
-			$byte2 = $code_point & 0x3F | 0x80;
+			$byte2 = chr( $code_point & 0x3F | 0x80 );
-			return pack( 'CC', $byte1, $byte2 );
+			return "{$byte1}{$byte2}";
 		}
 		if ( $code_point <= 0xFFFF ) {
-			$byte1 = ( $code_point >> 12 ) | 0xE0;
+			$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
-			$byte2 = ( $code_point >> 6 ) & 0x3F | 0x80;
+			$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-			$byte3 = $code_point & 0x3F | 0x80;
+			$byte3 = chr( $code_point & 0x3F | 0x80 );
-			return pack( 'CCC', $byte1, $byte2, $byte3 );
+			return "{$byte1}{$byte2}{$byte3}";
 		}
 		// Any values above U+10FFFF are eliminated above in the pre-check.
-		$byte1 = ( $code_point >> 18 ) | 0xF0;
+		$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
-		$byte2 = ( $code_point >> 12 ) & 0x3F | 0x80;
+		$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
-		$byte3 = ( $code_point >> 6 ) & 0x3F | 0x80;
+		$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-		$byte4 = $code_point & 0x3F | 0x80;
+		$byte4 = chr( $code_point & 0x3F | 0x80 );
-		return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 );
+		return "{$byte1}{$byte2}{$byte3}{$byte4}";
 	}
 }
--- a/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/wp-includes/html-api/class-wp-html-tag-processor.php
@ -1524,21 +1524,10 @@ class WP_HTML_Tag_Processor {
 		$was_at     = $this->bytes_already_parsed;
 		$at         = $was_at;
-		while ( false !== $at && $at < $doc_length ) {
+		while ( $at < $doc_length ) {
 			$at = strpos( $html, '<', $at );
 			/*
 			 * This does not imply an incomplete parse; it indicates that there
 			 * can be nothing left in the document other than a #text node.
 			 */
 			if ( false === $at ) {
-				$this->parser_state         = self::STATE_TEXT_NODE;
+				break;
 				$this->token_starts_at      = $was_at;
 				$this->token_length         = strlen( $html ) - $was_at;
 				$this->text_starts_at       = $was_at;
 				$this->text_length          = $this->token_length;
 				$this->bytes_already_parsed = strlen( $html );
 				return true;
 			}
 			if ( $at > $was_at ) {
@ -1554,19 +1543,9 @@ class WP_HTML_Tag_Processor {
 				 *
 				 * @see https://html.spec.whatwg.org/#tag-open-state
 				 */
-				if ( strlen( $html ) > $at + 1 ) {
+				if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
-					$next_character  = $html[ $at + 1 ];
+					++$at;
-					$at_another_node = (
+					continue;
 						'!' === $next_character ||
 						'/' === $next_character ||
 						'?' === $next_character ||
 						( 'A' <= $next_character && $next_character <= 'Z' ) ||
 						( 'a' <= $next_character && $next_character <= 'z' )
 					);
 					if ( ! $at_another_node ) {
 						++$at;
 						continue;
 					}
 				}
 				$this->parser_state         = self::STATE_TEXT_NODE;
@ -1630,11 +1609,7 @@ class WP_HTML_Tag_Processor {
 				 * `<!--` transitions to a comment state – apply further comment rules.
 				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 				 */
-				if (
+				if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
 					$doc_length > $at + 3 &&
 					'-' === $html[ $at + 2 ] &&
 					'-' === $html[ $at + 3 ]
 				) {
 					$closer_at = $at + 4;
 					// If it's not possible to close the comment then there is nothing more to scan.
 					if ( $doc_length <= $closer_at ) {
@ -1911,7 +1886,17 @@ class WP_HTML_Tag_Processor {
 			++$at;
 		}
-		return false;
+		/*
 		 * This does not imply an incomplete parse; it indicates that there
 		 * can be nothing left in the document other than a #text node.
 		 */
 		$this->parser_state         = self::STATE_TEXT_NODE;
 		$this->token_starts_at      = $was_at;
 		$this->token_length         = $doc_length - $was_at;
 		$this->text_starts_at       = $was_at;
 		$this->text_length          = $this->token_length;
 		$this->bytes_already_parsed = $doc_length;
 		return true;
 	}
 	/**
@ -1922,9 +1907,11 @@ class WP_HTML_Tag_Processor {
 	 * @return bool Whether an attribute was found before the end of the document.
 	 */
 	private function parse_next_attribute() {
 		$doc_length = strlen( $this->html );
 		// Skip whitespace and slashes.
 		$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
-		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+		if ( $this->bytes_already_parsed >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 			return false;
@ -1941,21 +1928,21 @@ class WP_HTML_Tag_Processor {
 			: strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed );
 		// No attribute, just tag closer.
-		if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
+		if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) {
 			return false;
 		}
 		$attribute_start             = $this->bytes_already_parsed;
 		$attribute_name              = substr( $this->html, $attribute_start, $name_length );
 		$this->bytes_already_parsed += $name_length;
-		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+		if ( $this->bytes_already_parsed >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 			return false;
 		}
 		$this->skip_whitespace();
-		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+		if ( $this->bytes_already_parsed >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 			return false;
@ -1965,7 +1952,7 @@ class WP_HTML_Tag_Processor {
 		if ( $has_value ) {
 			++$this->bytes_already_parsed;
 			$this->skip_whitespace();
-			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			if ( $this->bytes_already_parsed >= $doc_length ) {
 				$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 				return false;
@ -1976,8 +1963,10 @@ class WP_HTML_Tag_Processor {
 				case '"':
 					$quote                      = $this->html[ $this->bytes_already_parsed ];
 					$value_start                = $this->bytes_already_parsed + 1;
-					$value_length               = strcspn( $this->html, $quote, $value_start );
+					$end_quote_at               = strpos( $this->html, $quote, $value_start );
-					$attribute_end              = $value_start + $value_length + 1;
+					$end_quote_at               = false === $end_quote_at ? $doc_length : $end_quote_at;
 					$value_length               = $end_quote_at - $value_start;
 					$attribute_end              = $end_quote_at + 1;
 					$this->bytes_already_parsed = $attribute_end;
 					break;
@ -1993,7 +1982,7 @@ class WP_HTML_Tag_Processor {
 			$attribute_end = $attribute_start + $name_length;
 		}
-		if ( $attribute_end >= strlen( $this->html ) ) {
+		if ( $attribute_end >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 			return false;
@ -2014,7 +2003,7 @@ class WP_HTML_Tag_Processor {
 		$comparable_name = strtolower( $attribute_name );
 		// If an attribute is listed many times, only use the first declaration and ignore the rest.
-		if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
+		if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
 			$this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
 				$attribute_name,
 				$value_start,
@ -2038,7 +2027,7 @@ class WP_HTML_Tag_Processor {
 		$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
 		if ( null === $this->duplicate_attributes ) {
 			$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
-		} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
+		} elseif ( ! isset( $this->duplicate_attributes[ $comparable_name ] ) ) {
 			$this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
 		} else {
 			$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;
@ -3110,14 +3099,12 @@ class WP_HTML_Tag_Processor {
 		);
 		// Removes any duplicated attributes if they were also present.
-		if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
+		foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) {
-			foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
+			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
-				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+				$attribute_token->start,
-					$attribute_token->start,
+				$attribute_token->length,
-					$attribute_token->length,
+				''
-					''
+			);
 				);
 			}
 		}
 		return true;
@ -3317,35 +3304,8 @@ class WP_HTML_Tag_Processor {
 		}
 		// Does the tag name match the requested tag name in a case-insensitive manner?
-		if ( null !== $this->sought_tag_name ) {
+		if ( isset( $this->sought_tag_name ) && ( strlen( $this->sought_tag_name ) !== $this->tag_name_length || 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) ) ) {
-			/*
+			return false;
 			 * String (byte) length lookup is fast. If they aren't the
 			 * same length then they can't be the same string values.
 			 */
 			if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
 				return false;
 			}
 			/*
 			 * Check each character to determine if they are the same.
 			 * Defer calls to `strtoupper()` to avoid them when possible.
 			 * Calling `strcasecmp()` here tested slowed than comparing each
 			 * character, so unless benchmarks show otherwise, it should
 			 * not be used.
 			 *
 			 * It's expected that most of the time that this runs, a
 			 * lower-case tag name will be supplied and the input will
 			 * contain lower-case tag names, thus normally bypassing
 			 * the case comparison code.
 			 */
 			for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
 				$html_char = $this->html[ $this->tag_name_starts_at + $i ];
 				$tag_char  = $this->sought_tag_name[ $i ];
 				if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
 					return false;
 				}
 			}
 		}
 		if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@ -16,7 +16,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '6.7-alpha-58612';
+$wp_version = '6.7-alpha-58613';
 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.