Shortcodes/Formatting: Add PCRE Performance Testing

* Move pattern from `wptexturize()` into a separate function.
* Move pattern from `wp_html_split()` into a separate function.
* Beautify code for `wp_html_split()`.
* Remove unnecessary instances of `/s` modifier in patterns that don't use dots.
* Add `tests/phpunit/data/formatting/whole-posts.php` for testing larger strings.
* Add function `benchmark_pcre_backtracking()`.
* Add tests for `wp_html_split()`.
* Add tests for `wptexturize()`.
* Add tests for `get_shortcode_regex()`.

Props miqrogroove.
Fixes #34121.

Built from https://develop.svn.wordpress.org/trunk@34761

git-svn-id: http://core.svn.wordpress.org/trunk@34726 1a063a9b-81f0-0310-95a4-ce76da25c4cd
2015-10-02 04:26:25 +00:00 · 2015-10-02 04:26:25 +00:00 · 7a0f8602f3
parent da1c938a5c
commit 7a0f8602f3
3 changed files with 110 additions and 56 deletions
--- a/wp-includes/formatting.php
+++ b/wp-includes/formatting.php
@@ -219,43 +219,8 @@ function wptexturize( $text, $reset = false ) {
 	preg_match_all( '@\[/?([^<>&/\[\]\x00-\x20]++)@', $text, $matches );
 	$tagnames = array_intersect( array_keys( $shortcode_tags ), $matches[1] );
 	$found_shortcodes = ! empty( $tagnames );
-	if ( $found_shortcodes ) {
-		$tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) );
-		$tagregexp = "(?:$tagregexp)(?![\\w-])"; // Excerpt of get_shortcode_regex().
-		$shortcode_regex =
-			  '\['              // Find start of shortcode.
-			. '[\/\[]?'         // Shortcodes may begin with [/ or [[
-			. $tagregexp        // Only match registered shortcodes, because performance.
-			. '(?:'
-			.     '[^\[\]<>]+'  // Shortcodes do not contain other shortcodes. Quantifier critical.
-			. '|'
-			.     '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >.
-			. ')*+'             // Possessive critical.
-			. '\]'              // Find end of shortcode.
-			. '\]?';            // Shortcodes may end with ]]
-	}
-
-	$comment_regex =
-		  '!'           // Start of comment, after the <.
-		. '(?:'         // Unroll the loop: Consume everything until --> is found.
-		.     '-(?!->)' // Dash not followed by end of comment.
-		.     '[^\-]*+' // Consume non-dashes.
-		. ')*+'         // Loop possessively.
-		. '(?:-->)?';   // End of comment. If not found, match all input.
-
-	$html_regex =			 // Needs replaced with wp_html_split() per Shortcode API Roadmap.
-		  '<'                // Find start of element.
-		. '(?(?=!--)'        // Is this a comment?
-		.     $comment_regex // Find end of comment.
-		. '|'
-		.     '[^>]*>?'      // Find end of element. If not found, match all input.
-		. ')';
-
-	if ( $found_shortcodes ) {
-		$regex = '/(' . $html_regex . '|' . $shortcode_regex . ')/s';
-	} else {
-		$regex = '/(' . $html_regex . ')/s';
-	}
+	$shortcode_regex = $found_shortcodes ? _get_wptexturize_shortcode_regex( $tagnames ) : '';
+	$regex = _get_wptexturize_split_regex( $shortcode_regex );

 	$textarr = preg_split( $regex, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );

@@ -264,7 +229,7 @@ function wptexturize( $text, $reset = false ) {
 		$first = $curl[0];
 		if ( '<' === $first ) {
 			if ( '<!--' === substr( $curl, 0, 4 ) ) {
-				// This is an HTML comment delimeter.
+				// This is an HTML comment delimiter.
 				continue;
 			} else {
 				// This is an HTML element delimiter.
@@ -615,6 +580,17 @@ function wpautop( $pee, $br = true ) {
 * @return array The formatted text.
 */
 function wp_html_split( $input ) {
+	return preg_split( get_html_split_regex(), $input, -1, PREG_SPLIT_DELIM_CAPTURE );
+}
+
+/**
+ * Retrieve the regular expression for an HTML element.
+ *
+ * @since 4.4.0
+ *
+ * @return string The regular expression
+ */
+function get_html_split_regex() {
 	static $regex;

 	if ( ! isset( $regex ) ) {
@@ -635,22 +611,100 @@ function wp_html_split( $input ) {
 			. ')*+'         // Loop possessively.
 			. '(?:]]>)?';   // End of comment. If not found, match all input.

+		$escaped = 
+			  '(?='           // Is the element escaped?
+			.    '!--'
+			. '|'
+			.    '!\[CDATA\['
+			. ')'
+			. '(?(?=!-)'      // If yes, which type?
+			.     $comments
+			. '|'
+			.     $cdata
+			. ')';
+
 		$regex =
 			  '/('              // Capture the entire match.
 			.     '<'           // Find start of element.
-			.     '(?(?=!--)'   // Is this a comment?
-			.         $comments // Find end of comment.
-			.     '|'
-			.         '(?(?=!\[CDATA\[)' // Is this a comment?
-			.             $cdata // Find end of comment.
-			.         '|'
-			.             '[^>]*>?' // Find end of element. If not found, match all input.
-			.         ')'
+			.     '(?'          // Conditional expression follows.
+			.         $escaped  // Find end of escaped element.
+			.     '|'           // ... else ...
+			.         '[^>]*>?' // Find end of normal element.
 			.     ')'
-			. ')/s';
+			. ')/';
 	}

-	return preg_split( $regex, $input, -1, PREG_SPLIT_DELIM_CAPTURE );
+	return $regex;
+}
+
+/**
+ * Retrieve the combined regular expression for splitting text on HTML elements and, optionally, shortcodes.
+ *
+ * @access private
+ * @ignore
+ * @internal This function will be removed in 4.5.0 per Shortcode API Roadmap.
+ * @since 4.4.0
+ *
+ * @param string $shortcode_regex Optional. The result from _get_wptexturize_shortcode_regex(). Default empty string.
+ * @return string The delimited regular expression, wrapped in a single capture group.
+ */
+function _get_wptexturize_split_regex( $shortcode_regex = '' ) {
+	static $html_regex; // Built on the first call, then reused by later calls.
+
+	if ( ! isset( $html_regex ) ) {
+		$comment_regex =
+			  '!'           // Start of comment, after the <.
+			. '(?:'         // Unroll the loop: Consume everything until --> is found.
+			.     '-(?!->)' // Dash not followed by end of comment.
+			.     '[^\-]*+' // Consume non-dashes.
+			. ')*+'         // Loop possessively.
+			. '(?:-->)?';   // End of comment. If not found, match all input.
+
+		$html_regex =			 // Needs to be replaced with wp_html_split() per Shortcode API Roadmap.
+			  '<'                // Find start of element.
+			. '(?(?=!--)'        // Is this a comment?
+			.     $comment_regex // Find end of comment.
+			. '|'
+			.     '[^>]*>?'      // Find end of element. If not found, match all input.
+			. ')';
+	}
+
+	if ( empty( $shortcode_regex ) ) { // No shortcode pattern supplied: match HTML elements only.
+		$regex = '/(' . $html_regex . ')/';
+	} else { // Otherwise match either an HTML element or a shortcode.
+		$regex = '/(' . $html_regex . '|' . $shortcode_regex . ')/';
+	}
+
+	return $regex;
+}
+
+/**
+ * Retrieve the regular expression for matching the given shortcode tags.
+ *
+ * @access private
+ * @ignore
+ * @internal This function will be removed in 4.5.0 per Shortcode API Roadmap.
+ * @since 4.4.0
+ *
+ * @param array $tagnames List of shortcodes to find.
+ * @return string The regular expression fragment, without delimiters.
+ */
+function _get_wptexturize_shortcode_regex( $tagnames ) {
+	$tagregexp = join( '|', array_map( 'preg_quote', $tagnames ) ); // Alternation of regex-escaped tag names.
+	$tagregexp = "(?:$tagregexp)(?=[\\s\\]\\/])"; // Excerpt of get_shortcode_regex(). Name must be followed by whitespace, ] or /.
+	$regex =
+		  '\['              // Find start of shortcode.
+		. '[\/\[]?'         // Shortcodes may begin with [/ or [[
+		. $tagregexp        // Only match registered shortcodes, because performance.
+		. '(?:'
+		.     '[^\[\]<>]+'  // Shortcodes do not contain other shortcodes. Quantifier critical.
+		. '|'
+		.     '<[^\[\]>]*>' // HTML elements permitted. Prevents matching ] before >.
+		. ')*+'             // Possessive critical.
+		. '\]'              // Find end of shortcode.
+		. '\]?';            // Shortcodes may end with ]]
+
+	return $regex;
 }

 /**
@@ -768,7 +822,7 @@ function shortcode_unautop( $pee ) {
 		. ')'
 		. '(?:' . $spaces . ')*+'            // optional trailing whitespace
 		. '<\\/p>'                           // closing paragraph
-		. '/s';
+		. '/';

 	return preg_replace( $pattern, '$1', $pee );
 }
--- a/wp-includes/shortcodes.php
+++ b/wp-includes/shortcodes.php
@@ -168,7 +168,7 @@ function has_shortcode( $content, $tag ) {
 	}

 	if ( shortcode_exists( $tag ) ) {
-		preg_match_all( '/' . get_shortcode_regex() . '/s', $content, $matches, PREG_SET_ORDER );
+		preg_match_all( '/' . get_shortcode_regex() . '/', $content, $matches, PREG_SET_ORDER );
 		if ( empty( $matches ) )
 			return false;

@@ -219,7 +219,7 @@ function do_shortcode( $content, $ignore_html = false ) {
 	$content = do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames );

 	$pattern = get_shortcode_regex( $tagnames );
-	$content = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $content );
+	$content = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $content );

 	// Always restore square braces so we don't break things like <!--[if IE ]>
 	$content = unescape_invalid_shortcodes( $content );
@@ -378,7 +378,7 @@ function do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames ) {
 		if ( false === $attributes ) {
 			// Some plugins are doing things like [name] <[email]>.
 			if ( 1 === preg_match( '%^<\s*\[\[?[^\[\]]+\]%', $element ) ) {
-				$element = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $element );
+				$element = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $element );
 			}

 			// Looks like we found some crazy unfiltered HTML.  Skipping it for sanity.
@@ -407,12 +407,12 @@ function do_shortcodes_in_html_tags( $content, $ignore_html, $tagnames ) {
 				// In this specific situation we assume KSES did not run because the input
 				// was written by an administrator, so we should avoid changing the output
 				// and we do not need to run KSES here.
-				$attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr );
+				$attr = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $attr );
 			} else {
 				// $attr like 'name = "[shortcode]"' or "name = '[shortcode]'"
 				// We do not know if $content was unfiltered. Assume KSES ran before shortcodes.
 				$count = 0;
-				$new_attr = preg_replace_callback( "/$pattern/s", 'do_shortcode_tag', $attr, -1, $count );
+				$new_attr = preg_replace_callback( "/$pattern/", 'do_shortcode_tag', $attr, -1, $count );
 				if ( $count > 0 ) {
 					// Sanitize the shortcode output using KSES.
 					$new_attr = wp_kses_one_attr( $new_attr, $elname );
@@ -572,7 +572,7 @@ function strip_shortcodes( $content ) {
 	$content = do_shortcodes_in_html_tags( $content, true, $tagnames );

 	$pattern = get_shortcode_regex( $tagnames );
-	$content = preg_replace_callback( "/$pattern/s", 'strip_shortcode_tag', $content );
+	$content = preg_replace_callback( "/$pattern/", 'strip_shortcode_tag', $content );

 	// Always restore square braces so we don't break things like <!--[if IE ]>
 	$content = unescape_invalid_shortcodes( $content );
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -4,7 +4,7 @@
 *
 * @global string $wp_version
 */
-$wp_version = '4.4-alpha-34760';
+$wp_version = '4.4-alpha-34761';

 /**
 * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.