HTML API: Respect document compat mode when handling CSS class names.

The HTML API has been behaving as if CSS class name selectors matched class names in an ASCII case-insensitive manner. This is only true if the document in question is set to quirks mode. Unfortunately most documents processed will be set to no-quirks mode, meaning that some CSS behaviors have been matching incorrectly when provided with case variants of class names.

In this patch, the CSS methods have been audited and updated to adhere to the rules governing ASCII case sensitivity when matching classes. This includes `add_class()`, `remove_class()`, `has_class()`, and `class_list()`. Now, it is assumed that a document is in no-quirks mode unless a full HTML parser infers quirks mode, and these methods will treat class names in a byte-for-byte manner. Otherwise, when a document is in quirks mode, the methods will compare the provided class names against existing class names for the tag in an ASCII case-insensitive way, while `class_list()` will return a lower-cased version of the existing class names.

The lower-casing in `class_list()` is performed for consistency, since it's possible that multiple case variants of the same comparable class name exist on a tag in the input HTML.

Developed in https://github.com/WordPress/wordpress-develop/pull/7169
Discussed in https://core.trac.wordpress.org/ticket/61531

Props dmsnell, jonsurrell.
See #61531.

Built from https://develop.svn.wordpress.org/trunk@58985


git-svn-id: http://core.svn.wordpress.org/trunk@58381 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
dmsnell 2024-09-04 04:34:15 +00:00
parent 1e6b860630
commit d0d8ba24d8
4 changed files with 151 additions and 79 deletions

View File

@ -299,31 +299,6 @@ class WP_HTML_Processor_State {
*/ */
const INSERTION_MODE_AFTER_AFTER_FRAMESET = 'insertion-mode-after-after-frameset'; const INSERTION_MODE_AFTER_AFTER_FRAMESET = 'insertion-mode-after-after-frameset';
/**
* No-quirks mode document compatability mode.
*
* > In no-quirks mode, the behavior is (hopefully) the desired behavior
* > described by the modern HTML and CSS specifications.
*
* @since 6.7.0
*
* @var string
*/
const NO_QUIRKS_MODE = 'no-quirks-mode';
/**
* Quirks mode document compatability mode.
*
* > In quirks mode, layout emulates behavior in Navigator 4 and Internet
* > Explorer 5. This is essential in order to support websites that were
* > built before the widespread adoption of web standards.
*
* @since 6.7.0
*
* @var string
*/
const QUIRKS_MODE = 'quirks-mode';
/** /**
* The stack of template insertion modes. * The stack of template insertion modes.
* *
@ -381,30 +356,6 @@ class WP_HTML_Processor_State {
*/ */
public $insertion_mode = self::INSERTION_MODE_INITIAL; public $insertion_mode = self::INSERTION_MODE_INITIAL;
/**
* Indicates if the document is in quirks mode or no-quirks mode.
*
* Impact on HTML parsing:
*
* - In `NO_QUIRKS_MODE` CSS class and ID selectors match in a byte-for-byte
* manner, otherwise for backwards compatability, class selectors are to
* match in an ASCII case-insensitive manner.
*
* - When not in `QUIRKS_MODE`, a TABLE start tag implicitly closes an open P tag
* if one is in scope and open, otherwise the TABLE becomes a child of the P.
*
* `QUIRKS_MODE` impacts many styling-related aspects of an HTML document, but
* none of the other changes modifies how the HTML is parsed or selected.
*
* @see self::QUIRKS_MODE
* @see self::NO_QUIRKS_MODE
*
* @since 6.7.0
*
* @var string
*/
public $document_mode = self::NO_QUIRKS_MODE;
/** /**
* Context node initializing fragment parser, if created as a fragment parser. * Context node initializing fragment parser, if created as a fragment parser.
* *

View File

@ -1080,7 +1080,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
case 'html': case 'html':
$doctype = $this->get_doctype_info(); $doctype = $this->get_doctype_info();
if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) { if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE; $this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
} }
/* /*
@ -1095,7 +1095,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > Anything else * > Anything else
*/ */
initial_anything_else: initial_anything_else:
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE; $this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
return $this->step( self::REPROCESS_CURRENT_NODE ); return $this->step( self::REPROCESS_CURRENT_NODE );
} }
@ -2448,7 +2448,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > has a p element in button scope, then close a p element. * > has a p element in button scope, then close a p element.
*/ */
if ( if (
WP_HTML_Processor_State::QUIRKS_MODE !== $this->state->document_mode && WP_HTML_Tag_Processor::QUIRKS_MODE !== $this->compat_mode &&
$this->state->stack_of_open_elements->has_p_in_button_scope() $this->state->stack_of_open_elements->has_p_in_button_scope()
) { ) {
$this->close_a_p_element(); $this->close_a_p_element();
@ -4938,6 +4938,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* *
* @since 6.6.0 Subclassed for the HTML Processor. * @since 6.6.0 Subclassed for the HTML Processor.
* *
* @todo When reconstructing active formatting elements with attributes, find a way
* to indicate if the virtually-reconstructed formatting elements contain the
* wanted class name.
*
* @param string $wanted_class Look for this CSS class name, ASCII case-insensitive. * @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
* @return bool|null Whether the matched tag contains the given class name, or null if not matched. * @return bool|null Whether the matched tag contains the given class name, or null if not matched.
*/ */

View File

@ -511,6 +511,32 @@ class WP_HTML_Tag_Processor {
*/ */
protected $parser_state = self::STATE_READY; protected $parser_state = self::STATE_READY;
/**
* Indicates if the document is in quirks mode or no-quirks mode.
*
* Impact on HTML parsing:
*
* - In `NO_QUIRKS_MODE` (also known as "standard mode"):
* - CSS class and ID selectors match byte-for-byte (case-sensitively).
* - A TABLE start tag `<table>` implicitly closes any open `P` element.
*
* - In `QUIRKS_MODE`:
* - CSS class and ID selectors match in an ASCII case-insensitive manner.
* - A TABLE start tag `<table>` opens a `TABLE` element as a child of a `P`
* element if one is open.
*
* Quirks and no-quirks mode are thus mostly about styling, but have an impact when
* tables are found inside paragraph elements.
*
* @see self::QUIRKS_MODE
* @see self::NO_QUIRKS_MODE
*
* @since 6.7.0
*
* @var string
*/
protected $compat_mode = self::NO_QUIRKS_MODE;
/** /**
* Indicates whether the parser is inside foreign content, * Indicates whether the parser is inside foreign content,
* e.g. inside an SVG or MathML element. * e.g. inside an SVG or MathML element.
@ -1155,6 +1181,8 @@ class WP_HTML_Tag_Processor {
$seen = array(); $seen = array();
$is_quirks = self::QUIRKS_MODE === $this->compat_mode;
$at = 0; $at = 0;
while ( $at < strlen( $class ) ) { while ( $at < strlen( $class ) ) {
// Skip past any initial boundary characters. // Skip past any initial boundary characters.
@ -1169,13 +1197,11 @@ class WP_HTML_Tag_Processor {
return; return;
} }
/* $name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) );
* CSS class names are case-insensitive in the ASCII range. if ( $is_quirks ) {
* $name = strtolower( $name );
* @see https://www.w3.org/TR/CSS2/syndata.html#x1 }
*/ $at += $length;
$name = str_replace( "\x00", "\u{FFFD}", strtolower( substr( $class, $at, $length ) ) );
$at += $length;
/* /*
* It's expected that the number of class names for a given tag is relatively small. * It's expected that the number of class names for a given tag is relatively small.
@ -1205,10 +1231,14 @@ class WP_HTML_Tag_Processor {
return null; return null;
} }
$wanted_class = strtolower( $wanted_class ); $case_insensitive = self::QUIRKS_MODE === $this->compat_mode;
$wanted_length = strlen( $wanted_class );
foreach ( $this->class_list() as $class_name ) { foreach ( $this->class_list() as $class_name ) {
if ( $class_name === $wanted_class ) { if (
strlen( $class_name ) === $wanted_length &&
0 === substr_compare( $class_name, $wanted_class, 0, strlen( $wanted_class ), $case_insensitive )
) {
return true; return true;
} }
} }
@ -2296,6 +2326,23 @@ class WP_HTML_Tag_Processor {
*/ */
$modified = false; $modified = false;
$seen = array();
$to_remove = array();
$is_quirks = self::QUIRKS_MODE === $this->compat_mode;
if ( $is_quirks ) {
foreach ( $this->classname_updates as $updated_name => $action ) {
if ( self::REMOVE_CLASS === $action ) {
$to_remove[] = strtolower( $updated_name );
}
}
} else {
foreach ( $this->classname_updates as $updated_name => $action ) {
if ( self::REMOVE_CLASS === $action ) {
$to_remove[] = $updated_name;
}
}
}
// Remove unwanted classes by only copying the new ones. // Remove unwanted classes by only copying the new ones.
$existing_class_length = strlen( $existing_class ); $existing_class_length = strlen( $existing_class );
while ( $at < $existing_class_length ) { while ( $at < $existing_class_length ) {
@ -2311,25 +2358,23 @@ class WP_HTML_Tag_Processor {
break; break;
} }
$name = substr( $existing_class, $at, $name_length ); $name = substr( $existing_class, $at, $name_length );
$at += $name_length; $comparable_class_name = $is_quirks ? strtolower( $name ) : $name;
$at += $name_length;
// If this class is marked for removal, start processing the next one. // If this class is marked for removal, remove it and move on to the next one.
$remove_class = ( if ( in_array( $comparable_class_name, $to_remove, true ) ) {
isset( $this->classname_updates[ $name ] ) &&
self::REMOVE_CLASS === $this->classname_updates[ $name ]
);
// If a class has already been seen then skip it; it should not be added twice.
if ( ! $remove_class ) {
$this->classname_updates[ $name ] = self::SKIP_CLASS;
}
if ( $remove_class ) {
$modified = true; $modified = true;
continue; continue;
} }
// If a class has already been seen then skip it; it should not be added twice.
if ( in_array( $comparable_class_name, $seen, true ) ) {
continue;
}
$seen[] = $comparable_class_name;
/* /*
* Otherwise, append it to the new "class" attribute value. * Otherwise, append it to the new "class" attribute value.
* *
@ -2350,7 +2395,8 @@ class WP_HTML_Tag_Processor {
// Add new classes by appending those which haven't already been seen. // Add new classes by appending those which haven't already been seen.
foreach ( $this->classname_updates as $name => $operation ) { foreach ( $this->classname_updates as $name => $operation ) {
if ( self::ADD_CLASS === $operation ) { $comparable_name = $is_quirks ? strtolower( $name ) : $name;
if ( self::ADD_CLASS === $operation && ! in_array( $comparable_name, $seen, true ) ) {
$modified = true; $modified = true;
$class .= strlen( $class ) > 0 ? ' ' : ''; $class .= strlen( $class ) > 0 ? ' ' : '';
@ -3932,8 +3978,29 @@ class WP_HTML_Tag_Processor {
return false; return false;
} }
$this->classname_updates[ $class_name ] = self::ADD_CLASS; if ( self::QUIRKS_MODE !== $this->compat_mode ) {
$this->classname_updates[ $class_name ] = self::ADD_CLASS;
return true;
}
/*
* Because class names are matched ASCII-case-insensitively in quirks mode,
* this needs to see if a case variant of the given class name is already
* enqueued and update that existing entry, if so. This picks the casing of
* the first-provided class name for all lexical variations.
*/
$class_name_length = strlen( $class_name );
foreach ( $this->classname_updates as $updated_name => $action ) {
if (
strlen( $updated_name ) === $class_name_length &&
0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true )
) {
$this->classname_updates[ $updated_name ] = self::ADD_CLASS;
return true;
}
}
$this->classname_updates[ $class_name ] = self::ADD_CLASS;
return true; return true;
} }
@ -3953,10 +4020,29 @@ class WP_HTML_Tag_Processor {
return false; return false;
} }
if ( null !== $this->tag_name_starts_at ) { if ( self::QUIRKS_MODE !== $this->compat_mode ) {
$this->classname_updates[ $class_name ] = self::REMOVE_CLASS; $this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
return true;
} }
/*
* Because class names are matched ASCII-case-insensitively in quirks mode,
* this needs to see if a case variant of the given class name is already
* enqueued and update that existing entry, if so. This picks the casing of
* the first-provided class name for all lexical variations.
*/
$class_name_length = strlen( $class_name );
foreach ( $this->classname_updates as $updated_name => $action ) {
if (
strlen( $updated_name ) === $class_name_length &&
0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true )
) {
$this->classname_updates[ $updated_name ] = self::REMOVE_CLASS;
return true;
}
}
$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
return true; return true;
} }
@ -4350,6 +4436,37 @@ class WP_HTML_Tag_Processor {
*/ */
const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML'; const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
/**
* No-quirks mode document compatibility mode.
*
* > In no-quirks mode, the behavior is (hopefully) the desired behavior
* > described by the modern HTML and CSS specifications.
*
* @see self::$compat_mode
* @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
*
* @since 6.7.0
*
* @var string
*/
const NO_QUIRKS_MODE = 'no-quirks-mode';
/**
* Quirks mode document compatibility mode.
*
* > In quirks mode, layout emulates behavior in Navigator 4 and Internet
* > Explorer 5. This is essential in order to support websites that were
* > built before the widespread adoption of web standards.
*
* @see self::$compat_mode
* @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
*
* @since 6.7.0
*
* @var string
*/
const QUIRKS_MODE = 'quirks-mode';
/** /**
* Indicates that a span of text may contain any combination of significant * Indicates that a span of text may contain any combination of significant
* kinds of characters: NULL bytes, whitespace, and others. * kinds of characters: NULL bytes, whitespace, and others.

View File

@ -16,7 +16,7 @@
* *
* @global string $wp_version * @global string $wp_version
*/ */
$wp_version = '6.7-alpha-58984'; $wp_version = '6.7-alpha-58985';
/** /**
* Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema. * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.