WPDB: When checking that a string can be sent to MySQL, we shouldn't use `mb_convert_encoding()`, as it behaves differently to MySQL's character encoding conversion.

Merge of [32364] to the 4.1 branch.

Props mdawaffe, pento, nbachiyski, jorbin, johnjamesjacoby, jeremyfelt.

See #32165.

Built from https://develop.svn.wordpress.org/branches/4.1@32387


git-svn-id: http://core.svn.wordpress.org/branches/4.1@32357 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
Michael Adams 2015-05-06 19:07:25 +00:00
parent f70e3c08da
commit bb96e5b686
4 changed files with 254 additions and 79 deletions

View File

@ -442,8 +442,8 @@ function upgrade_all() {
if ( $wp_current_db_version < 29630 ) if ( $wp_current_db_version < 29630 )
upgrade_400(); upgrade_400();
if ( $wp_current_db_version < 30134 ) if ( $wp_current_db_version < 30135 )
upgrade_414(); upgrade_415();
maybe_disable_link_manager(); maybe_disable_link_manager();
@ -1334,22 +1334,46 @@ function upgrade_400() {
/** /**
* Execute changes made in WordPress 4.1.4. * Execute changes made in WordPress 4.1.4.
* *
* @since 4.1.3 * @since 4.1.4
*/ */
function upgrade_414() { function upgrade_414() {
}
/**
* Execute changes made in WordPress 4.1.5.
*
* @since 4.1.5
*/
function upgrade_415() {
global $wp_current_db_version, $wpdb; global $wp_current_db_version, $wpdb;
if ( $wp_current_db_version < 30134 ) { if ( $wp_current_db_version < 30135 ) {
$content_length = $wpdb->get_col_length( $wpdb->comments, 'comment_content' ); $content_length = $wpdb->get_col_length( $wpdb->comments, 'comment_content' );
if ( ! $content_length ) { if ( false === $content_length ) {
$content_length = 65535; $content_length = array(
'type' => 'byte',
'length' => 65535,
);
} elseif ( ! is_array( $content_length ) ) {
$length = (int) $content_length > 0 ? (int) $content_length : 65535;
$content_length = array(
'type' => 'byte',
'length' => $length
);
} }
if ( 'byte' !== $content_length['type'] ) {
// Sites with malformed DB schemas are on their own.
return;
}
$allowed_length = intval( $content_length['length'] ) - 10;
$comments = $wpdb->get_results( $comments = $wpdb->get_results(
"SELECT comment_ID FROM $wpdb->comments "SELECT `comment_ID` FROM `{$wpdb->comments}`
WHERE comment_date_gmt > '2015-04-26' WHERE `comment_date_gmt` > '2015-04-26'
AND CHAR_LENGTH( comment_content ) >= $content_length AND LENGTH( `comment_content` ) >= {$allowed_length}
AND ( comment_content LIKE '%<%' OR comment_content LIKE '%>%' )" AND ( `comment_content` LIKE '%<%' OR `comment_content` LIKE '%>%' )"
); );
foreach ( $comments as $comment ) { foreach ( $comments as $comment ) {

View File

@ -13,23 +13,141 @@ if ( !function_exists('_') ) {
} }
} }
if ( !function_exists('mb_substr') ): /**
function mb_substr( $str, $start, $length=null, $encoding=null ) { * Returns whether PCRE/u (PCRE_UTF8 modifier) is available for use.
return _mb_substr($str, $start, $length, $encoding); *
* @ignore
* @since 4.2.2
* @access private
*
* @param bool $set - Used for testing only
* null : default - get PCRE/u capability
* false : Used for testing - return false for future calls to this function
* 'reset': Used for testing - restore default behavior of this function
*/
function _wp_can_use_pcre_u( $set = null ) {
	// Remembered across calls. The string 'reset' marks "not probed yet".
	static $cached = 'reset';

	// Any non-null argument overrides the remembered value (test hook).
	if ( ! is_null( $set ) ) {
		$cached = $set;
	}

	// A value is already known (probed earlier or forced by the caller).
	if ( 'reset' !== $cached ) {
		return $cached;
	}

	// Probe once: run a trivial pattern with the /u modifier, with any
	// warning from the probe suppressed, and remember whatever it returns.
	$cached = @preg_match( '/^./u', 'a' );

	return $cached;
}
if ( ! function_exists( 'mb_substr' ) ) :
function mb_substr( $str, $start, $length = null, $encoding = null ) {
return _mb_substr( $str, $start, $length, $encoding );
} }
endif; endif;
function _mb_substr( $str, $start, $length=null, $encoding=null ) { /*
// the solution below, works only for utf-8, so in case of a different * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
// charset, just use built-in substr * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence.
$charset = get_option( 'blog_charset' ); * The behavior of this function for invalid inputs is undefined.
if ( !in_array( $charset, array('utf8', 'utf-8', 'UTF8', 'UTF-8') ) ) { */
return is_null( $length )? substr( $str, $start ) : substr( $str, $start, $length); function _mb_substr( $str, $start, $length = null, $encoding = null ) {
if ( null === $encoding ) {
$encoding = get_option( 'blog_charset' );
} }
// use the regex unicode support to separate the UTF-8 characters into an array
preg_match_all( '/./us', $str, $match ); // The solution below works only for UTF-8,
$chars = is_null( $length )? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length ); // so in case of a different charset just use built-in substr()
return implode( '', $chars ); if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
}
if ( _wp_can_use_pcre_u() ) {
// Use the regex unicode support to separate the UTF-8 characters into an array
preg_match_all( '/./us', $str, $match );
$chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
return implode( '', $chars );
}
$regex = '/(
[\x00-\x7F] # single-byte sequences 0xxxxxxx
| [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
| \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2
| [\xE1-\xEC][\x80-\xBF]{2}
| \xED[\x80-\x9F][\x80-\xBF]
| [\xEE-\xEF][\x80-\xBF]{2}
| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3
| [\xF1-\xF3][\x80-\xBF]{3}
| \xF4[\x80-\x8F][\x80-\xBF]{2}
)/x';
$chars = array( '' ); // Start with 1 element instead of 0 since the first thing we do is pop
do {
// We had some string left over from the last round, but we counted it in that last round.
array_pop( $chars );
// Split by UTF-8 character, limit to 1000 characters (last array element will contain the rest of the string)
$pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
$chars = array_merge( $chars, $pieces );
} while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) ); // If there's anything left over, repeat the loop.
return join( '', array_slice( $chars, $start, $length ) );
}
// Polyfill: only declared when no mb_strlen() function exists yet.
if ( ! function_exists( 'mb_strlen' ) ) {
	/**
	 * Stand-in for mb_strlen() that forwards to _mb_strlen().
	 *
	 * @param string      $str      The string to measure.
	 * @param string|null $encoding Optional. Character encoding, passed through unchanged. Default null.
	 * @return mixed Whatever _mb_strlen() returns for the same arguments.
	 */
	function mb_strlen( $str, $encoding = null ) {
		$length = _mb_strlen( $str, $encoding );

		return $length;
	}
}
/**
 * Counts the characters in a string without relying on the mbstring extension.
 *
 * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
 * For $encoding === UTF-8, the $str input is expected to be a valid UTF-8 byte sequence.
 * The behavior of the PCRE/u path for invalid input is undefined; the byte-regex
 * fallback does not count bytes that are not part of a valid sequence, and it
 * always terminates.
 *
 * @param string      $str      The string to measure.
 * @param string|null $encoding Optional. Character encoding. When null, the
 *                              'blog_charset' option is used. Default null.
 * @return int Number of characters for UTF-8 input, number of bytes otherwise.
 */
function _mb_strlen( $str, $encoding = null ) {
	if ( null === $encoding ) {
		$encoding = get_option( 'blog_charset' );
	}

	// The solution below works only for UTF-8,
	// so in case of a different charset just use built-in strlen()
	if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ) ) ) {
		return strlen( $str );
	}

	if ( _wp_can_use_pcre_u() ) {
		// Use the regex unicode support to separate the UTF-8 characters into an array
		preg_match_all( '/./us', $str, $match );
		return count( $match[0] );
	}

	$regex = '/(?:
		  [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
		| [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
		| \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
		| [\xE1-\xEC][\x80-\xBF]{2}
		| \xED[\x80-\x9F][\x80-\xBF]
		| [\xEE-\xEF][\x80-\xBF]{2}
		| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
		| [\xF1-\xF3][\x80-\xBF]{3}
		| \xF4[\x80-\x8F][\x80-\xBF]{2}
	)/x';

	$count = 0;

	do {
		// Split by UTF-8 character, limit to 1000 pieces (last array element will contain the rest of the string)
		$pieces = preg_split( $regex, $str, 1000 );

		if ( ! is_array( $pieces ) ) {
			// preg_split() failed; return what has been counted so far.
			break;
		}

		// Fencepost: N pieces means N - 1 delimiters, and every delimiter is one character.
		$matched = count( $pieces ) - 1;
		$count  += $matched;

		// Whatever was not split in this round is carried over to the next one.
		$str = array_pop( $pieces );

		/*
		 * Compare the leftover against '' explicitly: a leftover of "0" is falsy
		 * but still has to be counted. Stop as well when nothing matched, because
		 * the leftover then holds no valid sequence and another round would never
		 * make progress.
		 */
	} while ( $matched > 0 && '' !== $str );

	return $count;
}
if ( !function_exists('hash_hmac') ): if ( !function_exists('hash_hmac') ):

View File

@ -11,7 +11,7 @@ $wp_version = '4.1.4';
* *
* @global int $wp_db_version * @global int $wp_db_version
*/ */
$wp_db_version = 30134; $wp_db_version = 30135;
/** /**
* Holds the TinyMCE version * Holds the TinyMCE version

View File

@ -1789,6 +1789,8 @@ class wpdb {
* @return int|false The number of rows affected, or false on error. * @return int|false The number of rows affected, or false on error.
*/ */
function _insert_replace_helper( $table, $data, $format = null, $type = 'INSERT' ) { function _insert_replace_helper( $table, $data, $format = null, $type = 'INSERT' ) {
$this->insert_id = 0;
if ( ! in_array( strtoupper( $type ), array( 'REPLACE', 'INSERT' ) ) ) { if ( ! in_array( strtoupper( $type ), array( 'REPLACE', 'INSERT' ) ) ) {
return false; return false;
} }
@ -1809,7 +1811,6 @@ class wpdb {
$sql = "$type INTO `$table` ($fields) VALUES ($formats)"; $sql = "$type INTO `$table` ($fields) VALUES ($formats)";
$this->insert_id = 0;
$this->check_current_query = false; $this->check_current_query = false;
return $this->query( $this->prepare( $sql, $values ) ); return $this->query( $this->prepare( $sql, $values ) );
} }
@ -2001,17 +2002,11 @@ class wpdb {
// We can skip this field if we know it isn't a string. // We can skip this field if we know it isn't a string.
// This checks %d/%f versus ! %s because it's sprintf() could take more. // This checks %d/%f versus ! %s because it's sprintf() could take more.
$value['charset'] = false; $value['charset'] = false;
} elseif ( $this->check_ascii( $value['value'] ) ) {
// If it's ASCII, then we don't need the charset. We can skip this field.
$value['charset'] = false;
} else { } else {
$value['charset'] = $this->get_col_charset( $table, $field ); $value['charset'] = $this->get_col_charset( $table, $field );
if ( is_wp_error( $value['charset'] ) ) { if ( is_wp_error( $value['charset'] ) ) {
return false; return false;
} }
// This isn't ASCII. Don't have strip_invalid_text() re-check.
$value['ascii'] = false;
} }
$data[ $field ] = $value; $data[ $field ] = $value;
@ -2044,10 +2039,6 @@ class wpdb {
} }
} }
if ( false !== $value['length'] && strlen( $value['value'] ) > $value['length'] ) {
return false;
}
$data[ $field ] = $value; $data[ $field ] = $value;
} }
@ -2380,14 +2371,16 @@ class wpdb {
/** /**
* Retrieve the maximum string length allowed in a given column. * Retrieve the maximum string length allowed in a given column.
* The length may either be specified as a byte length or a character length.
* *
* @since 4.2.1 * @since 4.2.1
* @access public * @access public
* *
* @param string $table Table name. * @param string $table Table name.
* @param string $column Column name. * @param string $column Column name.
* @return mixed Max column length as an int. False if the column has no * @return mixed array( 'length' => (int), 'type' => 'byte' | 'char' )
* length. WP_Error object if there was an error. * false if the column has no length (for example, numeric column)
* WP_Error object if there was an error.
*/ */
public function get_col_length( $table, $column ) { public function get_col_length( $table, $column ) {
$tablekey = strtolower( $table ); $tablekey = strtolower( $table );
@ -2420,27 +2413,47 @@ class wpdb {
} }
switch( $type ) { switch( $type ) {
case 'binary':
case 'char': case 'char':
case 'varbinary':
case 'varchar': case 'varchar':
return $length; return array(
'type' => 'char',
'length' => (int) $length,
);
break;
case 'binary':
case 'varbinary':
return array(
'type' => 'byte',
'length' => (int) $length,
);
break; break;
case 'tinyblob': case 'tinyblob':
case 'tinytext': case 'tinytext':
return 255; // 2^8 - 1 return array(
'type' => 'byte',
'length' => 255, // 2^8 - 1
);
break; break;
case 'blob': case 'blob':
case 'text': case 'text':
return 65535; // 2^16 - 1 return array(
'type' => 'byte',
'length' => 65535, // 2^16 - 1
);
break; break;
case 'mediumblob': case 'mediumblob':
case 'mediumtext': case 'mediumtext':
return 16777215; // 2^24 - 1 return array(
'type' => 'byte',
'length' => 16777215, // 2^24 - 1
);
break; break;
case 'longblob': case 'longblob':
case 'longtext': case 'longtext':
return 4294967295; // 2^32 - 1 return array(
'type' => 'byte',
'length' => 4294967295, // 2^32 - 1
);
break; break;
default: default:
return false; return false;
@ -2547,50 +2560,55 @@ class wpdb {
*/ */
// If any of the columns don't have one of these collations, it needs more sanity checking. // If any of the columns don't have one of these collations, it needs more sanity checking.
protected function strip_invalid_text( $data ) { protected function strip_invalid_text( $data ) {
// Some multibyte character sets that we can check in PHP.
$mb_charsets = array(
'ascii' => 'ASCII',
'big5' => 'BIG-5',
'eucjpms' => 'eucJP-win',
'gb2312' => 'EUC-CN',
'ujis' => 'EUC-JP',
'utf32' => 'UTF-32',
);
$supported_charsets = array();
if ( function_exists( 'mb_list_encodings' ) ) {
$supported_charsets = mb_list_encodings();
}
$db_check_string = false; $db_check_string = false;
foreach ( $data as &$value ) { foreach ( $data as &$value ) {
$charset = $value['charset']; $charset = $value['charset'];
// Column isn't a string, or is latin1, which will will happily store anything. if ( is_array( $value['length'] ) ) {
if ( false === $charset || 'latin1' === $charset ) { $length = $value['length']['length'];
} else {
$length = false;
}
// There's no charset to work with.
if ( false === $charset ) {
continue; continue;
} }
// Column isn't a string.
if ( ! is_string( $value['value'] ) ) { if ( ! is_string( $value['value'] ) ) {
continue; continue;
} }
// ASCII is always OK. $truncate_by_byte_length = 'byte' === $value['length']['type'];
if ( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) ) {
continue; $needs_validation = true;
if (
// latin1 can store any byte sequence
'latin1' === $charset
||
// ASCII is always OK.
( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) )
) {
$truncate_by_byte_length = true;
$needs_validation = false;
} }
// Convert the text locally. if ( $truncate_by_byte_length ) {
if ( $supported_charsets ) { mbstring_binary_safe_encoding();
if ( isset( $mb_charsets[ $charset ] ) && in_array( $mb_charsets[ $charset ], $supported_charsets ) ) { if ( false !== $length && strlen( $value['value'] ) > $length ) {
$value['value'] = mb_convert_encoding( $value['value'], $mb_charsets[ $charset ], $mb_charsets[ $charset ] ); $value['value'] = substr( $value['value'], 0, $length );
}
reset_mbstring_encoding();
if ( ! $needs_validation ) {
continue; continue;
} }
} }
// utf8 can be handled by regex, which is a bunch faster than a DB lookup. // utf8 can be handled by regex, which is a bunch faster than a DB lookup.
if ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) { if ( ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) && function_exists( 'mb_strlen' ) ) {
$regex = '/ $regex = '/
( (
(?: [\x00-\x7F] # single-byte sequences 0xxxxxxx (?: [\x00-\x7F] # single-byte sequences 0xxxxxxx
@ -2600,7 +2618,7 @@ class wpdb {
| \xED[\x80-\x9F][\x80-\xBF] | \xED[\x80-\x9F][\x80-\xBF]
| [\xEE-\xEF][\x80-\xBF]{2}'; | [\xEE-\xEF][\x80-\xBF]{2}';
if ( 'utf8mb4' === $charset) { if ( 'utf8mb4' === $charset ) {
$regex .= ' $regex .= '
| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3 | \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3
| [\xF1-\xF3][\x80-\xBF]{3} | [\xF1-\xF3][\x80-\xBF]{3}
@ -2613,6 +2631,11 @@ class wpdb {
| . # anything else | . # anything else
/x'; /x';
$value['value'] = preg_replace( $regex, '$1', $value['value'] ); $value['value'] = preg_replace( $regex, '$1', $value['value'] );
if ( false !== $length && mb_strlen( $value['value'], 'UTF-8' ) > $length ) {
$value['value'] = mb_substr( $value['value'], 0, $length, 'UTF-8' );
}
continue; continue;
} }
@ -2629,8 +2652,14 @@ class wpdb {
$queries[ $value['charset'] ] = array(); $queries[ $value['charset'] ] = array();
} }
// Split the CONVERT() calls by charset, so we can make sure the connection is right // We're going to need to truncate by characters or bytes, depending on the length value we have.
$queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( %s USING {$value['charset']} )", $value['value'] ); if ( 'byte' === $value['length']['type'] ) {
// Split the CONVERT() calls by charset, so we can make sure the connection is right
$queries[ $value['charset'] ][ $col ] = $this->prepare( "CONVERT( LEFT( CONVERT( %s USING binary ), %d ) USING {$value['charset']} )", $value['value'], $value['length']['length'] );
} else {
$queries[ $value['charset'] ][ $col ] = $this->prepare( "LEFT( CONVERT( %s USING {$value['charset']} ), %d )", $value['value'], $value['length']['length'] );
}
unset( $data[ $col ]['db'] ); unset( $data[ $col ]['db'] );
} }
} }
@ -2649,16 +2678,19 @@ class wpdb {
$this->check_current_query = false; $this->check_current_query = false;
$row = $this->get_row( "SELECT " . implode( ', ', $query ), ARRAY_N ); $sql = array();
foreach ( $query as $column => $column_query ) {
$sql[] = $column_query . " AS x_$column";
}
$row = $this->get_row( "SELECT " . implode( ', ', $sql ), ARRAY_A );
if ( ! $row ) { if ( ! $row ) {
$this->set_charset( $this->dbh, $connection_charset ); $this->set_charset( $this->dbh, $connection_charset );
return new WP_Error( 'wpdb_strip_invalid_text_failure' ); return new WP_Error( 'wpdb_strip_invalid_text_failure' );
} }
$cols = array_keys( $query ); foreach ( array_keys( $query ) as $column ) {
$col_count = count( $cols ); $data[ $column ]['value'] = $row["x_$column"];
for ( $ii = 0; $ii < $col_count; $ii++ ) {
$data[ $cols[ $ii ] ]['value'] = $row[ $ii ];
} }
} }
@ -2700,6 +2732,7 @@ class wpdb {
'value' => $query, 'value' => $query,
'charset' => $charset, 'charset' => $charset,
'ascii' => false, 'ascii' => false,
'length' => false,
); );
$data = $this->strip_invalid_text( array( $data ) ); $data = $this->strip_invalid_text( array( $data ) );
@ -2722,7 +2755,7 @@ class wpdb {
* @return string|WP_Error The converted string, or a `WP_Error` object if the conversion fails. * @return string|WP_Error The converted string, or a `WP_Error` object if the conversion fails.
*/ */
public function strip_invalid_text_for_column( $table, $column, $value ) { public function strip_invalid_text_for_column( $table, $column, $value ) {
if ( ! is_string( $value ) || $this->check_ascii( $value ) ) { if ( ! is_string( $value ) ) {
return $value; return $value;
} }
@ -2739,7 +2772,7 @@ class wpdb {
$column => array( $column => array(
'value' => $value, 'value' => $value,
'charset' => $charset, 'charset' => $charset,
'ascii' => false, 'length' => $this->get_col_length( $table, $column ),
) )
); );