diff --git a/wp-includes/html-api/class-wp-html-doctype-info.php b/wp-includes/html-api/class-wp-html-doctype-info.php new file mode 100644 index 0000000000..e0396f7d7d --- /dev/null +++ b/wp-includes/html-api/class-wp-html-doctype-info.php @@ -0,0 +1,616 @@ +`. + * + * > DOCTYPEs are required for legacy reasons. When omitted, browsers tend to use a different + * > rendering mode that is incompatible with some specifications. Including the DOCTYPE in a + * > document ensures that the browser makes a best-effort attempt at following the + * > relevant specifications. + * + * @see https://html.spec.whatwg.org/#the-doctype + * + * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier, + * and an indication of which document compatability mode they would imply if an HTML parser + * hadn't already determined it from other information. + * + * @see https://html.spec.whatwg.org/#the-initial-insertion-mode + * + * Historically, the DOCTYPE declaration was used in SGML documents to instruct a parser how + * to interpret the various tags and entities within a document. Its role in HTML diverged + * from how it was used in SGML and no meaning should be back-read into HTML based on how it + * is used in SGML, XML, or XHTML documents. + * + * @see https://www.iso.org/standard/16387.html + * + * @since 6.7.0 + * + * @see WP_HTML_Processor + */ +class WP_HTML_Doctype_Info { + /** + * Name of the DOCTYPE: should be "html" for HTML documents. + * + * This value should be considered "read only" and not modified. + * + * Historically the DOCTYPE name indicates name of the document's root element. + * + * + * ╰──┴── name is "html". + * + * @see https://html.spec.whatwg.org/#tokenization + * + * @since 6.7.0 + * + * @var string|null + */ + public $name = null; + + /** + * Public identifier of the DOCTYPE. + * + * This value should be considered "read only" and not modified. 
+ * + * The public identifier is optional and should not appear in HTML documents. + * A `null` value indicates that no public identifier was present in the DOCTYPE. + * + * Historically the presence of the public identifier indicated that a document + * was meant to be shared between computer systems and the value indicated to a + * knowledgeable parser how to find the relevant document type definition (DTD). + * + * <!DOCTYPE html PUBLIC "public id goes here in quotes"> + * │ │ ╰─── public identifier ─────╯ + * ╰──┴── name is "html". + * + * @see https://html.spec.whatwg.org/#tokenization + * + * @since 6.7.0 + * + * @var string|null + */ + public $public_identifier = null; + + /** + * System identifier of the DOCTYPE. + * + * This value should be considered "read only" and not modified. + * + * The system identifier is optional and should not appear in HTML documents. + * A `null` value indicates that no system identifier was present in the DOCTYPE. + * + * Historically the system identifier specified where a relevant document type + * definition (DTD) for the given document is stored and may be retrieved. + * + * <!DOCTYPE html SYSTEM "system id goes here in quotes"> + * │ │ ╰──── system identifier ────╯ + * ╰──┴── name is "html". + * + * If a public identifier were provided it would indicate to a knowledgeable + * parser how to interpret the system identifier. + * + * <!DOCTYPE html PUBLIC "public id goes here in quotes" "system id goes here in quotes"> + * │ │ ╰─── public identifier ─────╯ ╰──── system identifier ────╯ + * ╰──┴── name is "html". + * + * @see https://html.spec.whatwg.org/#tokenization + * + * @since 6.7.0 + * + * @var string|null + */ + public $system_identifier = null; + + /** + * Which document compatibility mode this DOCTYPE declaration indicates. + * + * This value should be considered "read only" and not modified. + * + * When an HTML parser has not already set the document compatibility mode + * (e.g. "quirks" or "no-quirks" mode), it will infer it from the properties + * of the appropriate DOCTYPE declaration, if one exists. 
The DOCTYPE can + * indicate one of three possible document compatibility modes: + * + * - "no-quirks" and "limited-quirks" modes (also called "standards" mode; reported as `CSS1Compat` by `document.compatMode`). + * - "quirks" mode (reported as `BackCompat` by `document.compatMode`). + * + * An appropriate DOCTYPE is one encountered in the "initial" insertion mode, + * before the HTML element has been opened and before finding any other + * DOCTYPE declaration tokens. + * + * @see https://html.spec.whatwg.org/#the-initial-insertion-mode + * + * @since 6.7.0 + * + * @var string One of "no-quirks", "limited-quirks", or "quirks". + */ + public $indicated_compatability_mode; + + /** + * Constructor. + * + * This class should not be instantiated directly. + * Use the static {@see self::from_doctype_token} method instead. + * + * The arguments to this constructor correspond to the "DOCTYPE token" + * as defined in the HTML specification. + * + * > DOCTYPE tokens have a name, a public identifier, a system identifier, + * > and a force-quirks flag. When a DOCTYPE token is created, its name, public identifier, + * > and system identifier must be marked as missing (which is a distinct state from the + * > empty string), and the force-quirks flag must be set to off (its other state is on). + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#tokenization + * + * @since 6.7.0 + * + * @param string|null $name Name of the DOCTYPE. + * @param string|null $public_identifier Public identifier of the DOCTYPE. + * @param string|null $system_identifier System identifier of the DOCTYPE. + * @param bool $force_quirks_flag Whether the force-quirks flag is set for the token. 
+ */ + private function __construct( + ?string $name, + ?string $public_identifier, + ?string $system_identifier, + bool $force_quirks_flag + ) { + $this->name = $name; + $this->public_identifier = $public_identifier; + $this->system_identifier = $system_identifier; + + /* + * > If the DOCTYPE token matches one of the conditions in the following list, + * > then set the Document to quirks mode: + */ + + /* + * > The force-quirks flag is set to on. + */ + if ( $force_quirks_flag ) { + $this->indicated_compatability_mode = 'quirks'; + return; + } + + /* + * Normative documents will contain the literal `` with no + * public or system identifiers; short-circuit to avoid extra parsing. + */ + if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) { + $this->indicated_compatability_mode = 'no-quirks'; + return; + } + + /* + * > The name is not "html". + * + * The tokenizer must report the name in lower case even if provided in + * the document in upper case; thus no conversion is required here. + */ + if ( 'html' !== $name ) { + $this->indicated_compatability_mode = 'quirks'; + return; + } + + /* + * Set up some variables to handle the rest of the conditions. + * + * > set...the public identifier...to...the empty string if the public identifier was missing. + * > set...the system identifier...to...the empty string if the system identifier was missing. + * > + * > The system identifier and public identifier strings must be compared... + * > in an ASCII case-insensitive manner. + * > + * > A system identifier whose value is the empty string is not considered missing + * > for the purposes of the conditions above. + */ + $system_identifier_is_missing = null === $system_identifier; + $public_identifier = null === $public_identifier ? '' : strtolower( $public_identifier ); + $system_identifier = null === $system_identifier ? 
'' : strtolower( $system_identifier ); + + /* + * > The public identifier is set to… + */ + if ( + '-//w3o//dtd w3 html strict 3.0//en//' === $public_identifier || + '-/w3c/dtd html 4.0 transitional/en' === $public_identifier || + 'html' === $public_identifier + ) { + $this->indicated_compatability_mode = 'quirks'; + return; + } + + /* + * > The system identifier is set to… + */ + if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) { + $this->indicated_compatability_mode = 'quirks'; + return; + } + + /* + * All of the following conditions depend on matching the public identifier. + * If the public identifier is empty, none of the following conditions will match. + */ + if ( '' === $public_identifier ) { + $this->indicated_compatability_mode = 'no-quirks'; + return; + } + + /* + * > The public identifier starts with… + * + * @todo Optimize this matching. It shouldn't be a large overall performance issue, + * however, as only a single DOCTYPE declaration token should ever be parsed, + * and normative documents will have exited before reaching this condition. 
+ */ + if ( + str_starts_with( $public_identifier, '+//silmaril//dtd html pro v0r11 19970101//' ) || + str_starts_with( $public_identifier, '-//as//dtd html 3.0 aswedit + extensions//' ) || + str_starts_with( $public_identifier, '-//advasoft ltd//dtd html 3.0 aswedit + extensions//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 1//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 level 2//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 1//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict level 2//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 2.0 strict//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 2.0//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 2.1e//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 3.0//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 3.2 final//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 3.2//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html 3//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html level 0//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html level 1//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html level 2//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html level 3//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html strict level 0//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html strict level 1//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html strict level 2//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html strict level 3//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html strict//' ) || + str_starts_with( $public_identifier, '-//ietf//dtd html//' ) || + str_starts_with( $public_identifier, '-//metrius//dtd metrius presentational//' ) || + str_starts_with( 
$public_identifier, '-//microsoft//dtd internet explorer 2.0 html strict//' ) || + str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 html//' ) || + str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 2.0 tables//' ) || + str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html strict//' ) || + str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 html//' ) || + str_starts_with( $public_identifier, '-//microsoft//dtd internet explorer 3.0 tables//' ) || + str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd html//' ) || + str_starts_with( $public_identifier, '-//netscape comm. corp.//dtd strict html//' ) || + str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html 2.0//" ) || + str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended 1.0//" ) || + str_starts_with( $public_identifier, "-//o'reilly and associates//dtd html extended relaxed 1.0//" ) || + str_starts_with( $public_identifier, '-//sq//dtd html 2.0 hotmetal + extensions//' ) || + str_starts_with( $public_identifier, '-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//' ) || + str_starts_with( $public_identifier, '-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//' ) || + str_starts_with( $public_identifier, '-//spyglass//dtd html 2.0 extended//' ) || + str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava html//' ) || + str_starts_with( $public_identifier, '-//sun microsystems corp.//dtd hotjava strict html//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 3 1995-03-24//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 draft//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 3.2 final//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 3.2//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 3.2s draft//' ) || 
+ str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 frameset//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 4.0 transitional//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html experimental 19960712//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html experimental 970421//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd w3 html//' ) || + str_starts_with( $public_identifier, '-//w3o//dtd w3 html 3.0//' ) || + str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html 2.0//' ) || + str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html//' ) + ) { + $this->indicated_compatability_mode = 'quirks'; + return; + } + + /* + * > The system identifier is missing and the public identifier starts with… + */ + if ( + $system_identifier_is_missing && ( + str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' ) + ) + ) { + $this->indicated_compatability_mode = 'quirks'; + return; + } + + /* + * > Otherwise, if the DOCTYPE token matches one of the conditions in + * > the following list, then set the Document to limited-quirks mode. + */ + + /* + * > The public identifier starts with… + */ + if ( + str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' ) + ) { + $this->indicated_compatability_mode = 'limited-quirks'; + return; + } + + /* + * > The system identifier is not missing and the public identifier starts with… + */ + if ( + ! 
$system_identifier_is_missing && ( + str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 frameset//' ) || + str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' ) + ) + ) { + $this->indicated_compatability_mode = 'limited-quirks'; + return; + } + + $this->indicated_compatability_mode = 'no-quirks'; + } + + /** + * Creates a WP_HTML_Doctype_Info instance by parsing a raw DOCTYPE declaration token. + * + * Use this method to parse a DOCTYPE declaration token and get access to its properties + * via the returned WP_HTML_Doctype_Info class instance. The provided input must parse + * properly as a DOCTYPE declaration, though it need not represent a valid DOCTYPE. + * + * Example: + * + * // Normative HTML DOCTYPE declaration. + * $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' ); + * 'no-quirks' === $doctype->indicated_compatability_mode; + * + * // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode. + * $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' ); + * 'quirks' === $doctype->indicated_compatability_mode; + * + * // Textual quirks present in raw HTML are handled appropriately. + * $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" ); + * 'no-quirks' === $doctype->indicated_compatability_mode; + * + * // Anything other than a proper DOCTYPE declaration token fails to parse. + * null === WP_HTML_Doctype_Info::from_doctype_token( ' ' ); + * null === WP_HTML_Doctype_Info::from_doctype_token( '

' ); + * null === WP_HTML_Doctype_Info::from_doctype_token( '' ); + * null === WP_HTML_Doctype_Info::from_doctype_token( 'html' ); + * null === WP_HTML_Doctype_Info::from_doctype_token( '' ); + * + * @since 6.7.0 + * + * @param string $doctype_html The complete raw DOCTYPE HTML string, e.g. ``. + * + * @return WP_HTML_Doctype_Info|null A WP_HTML_Doctype_Info instance will be returned if the + * provided DOCTYPE HTML is a valid DOCTYPE. Otherwise, null. + */ + public static function from_doctype_token( string $doctype_html ): ?self { + $doctype_name = null; + $doctype_public_id = null; + $doctype_system_id = null; + + $end = strlen( $doctype_html ) - 1; + + /* + * This parser combines the rules for parsing DOCTYPE tokens found in the HTML + * specification for the DOCTYPE related tokenizer states. + * + * @see https://html.spec.whatwg.org/#doctype-state + */ + + /* + * - Valid DOCTYPE HTML token must be at least `` assuming a complete token not + * ending in end-of-file. + * - It must start with an ASCII case-insensitive match for `` must be the final byte in the HTML string. + */ + if ( + $end < 9 || + 0 !== substr_compare( $doctype_html, '`? + if ( '>' !== $doctype_html[ $end ] || ( strcspn( $doctype_html, '>', $at ) + $at ) < $end ) { + return null; + } + + /* + * Perform newline normalization and ensure the $end value is correct after normalization. + * + * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream + * @see https://infra.spec.whatwg.org/#normalize-newlines + */ + $doctype_html = str_replace( "\r\n", "\n", $doctype_html ); + $doctype_html = str_replace( "\r", "\n", $doctype_html ); + $end = strlen( $doctype_html ) - 1; + + /* + * In this state, the doctype token has been found and its "content" optionally including the + * name, public identifier, and system identifier is between the current position and the end. + * + * "" + * ╰─ $at ╰─ $end + * + * It's also possible that the declaration part is empty. 
+ * + * ╭─ $at + * "" + * ╰─ $end + * + * Rules for parsing ">" which terminates the DOCTYPE do not need to be considered as they + * have been handled above in the condition that the provided DOCTYPE HTML must contain + * exactly one ">" character in the final position. + */ + + /* + * + * Parsing effectively begins in "Before DOCTYPE name state". Ignore whitespace and + * proceed to the next state. + * + * @see https://html.spec.whatwg.org/#before-doctype-name-state + */ + $at += strspn( $doctype_html, " \t\n\f\r", $at ); + + if ( $at >= $end ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + + $name_length = strcspn( $doctype_html, " \t\n\f\r", $at, $end - $at ); + $doctype_name = str_replace( "\0", "\u{FFFD}", strtolower( substr( $doctype_html, $at, $name_length ) ) ); + + $at += $name_length; + $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at ); + if ( $at >= $end ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false ); + } + + /* + * "After DOCTYPE name state" + * + * Find a case-insensitive match for "PUBLIC" or "SYSTEM" at this point. + * Otherwise, set force-quirks and enter bogus DOCTYPE state (skip the rest of the doctype). + * + * @see https://html.spec.whatwg.org/#after-doctype-name-state + */ + if ( $at + 6 >= $end ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + + /* + * > If the six characters starting from the current input character are an ASCII + * > case-insensitive match for the word "PUBLIC", then consume those characters + * > and switch to the after DOCTYPE public keyword state. 
+ */ + if ( 0 === substr_compare( $doctype_html, 'PUBLIC', $at, 6, true ) ) { + $at += 6; + $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at ); + if ( $at >= $end ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + goto parse_doctype_public_identifier; + } + + /* + * > Otherwise, if the six characters starting from the current input character are an ASCII + * > case-insensitive match for the word "SYSTEM", then consume those characters and switch + * > to the after DOCTYPE system keyword state. + */ + if ( 0 === substr_compare( $doctype_html, 'SYSTEM', $at, 6, true ) ) { + $at += 6; + $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at ); + if ( $at >= $end ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + goto parse_doctype_system_identifier; + } + + /* + * > Otherwise, this is an invalid-character-sequence-after-doctype-name parse error. + * > Set the current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus + * > DOCTYPE state. + */ + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + + parse_doctype_public_identifier: + /* + * The parser should enter "DOCTYPE public identifier (double-quoted) state" or + * "DOCTYPE public identifier (single-quoted) state" by finding one of the valid quotes. + * Anything else forces quirks mode and ignores the rest of the contents. + * + * @see https://html.spec.whatwg.org/#doctype-public-identifier-(double-quoted)-state + * @see https://html.spec.whatwg.org/#doctype-public-identifier-(single-quoted)-state + */ + $closer_quote = $doctype_html[ $at ]; + + /* + * > This is a missing-quote-before-doctype-public-identifier parse error. Set the + * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state. 
+ */ + if ( '"' !== $closer_quote && "'" !== $closer_quote ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + + ++$at; + + $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at ); + $doctype_public_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) ); + + $at += $identifier_length; + if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + + ++$at; + + /* + * "Between DOCTYPE public and system identifiers state" + * + * Advance through whitespace between public and system identifiers. + * + * @see https://html.spec.whatwg.org/#between-doctype-public-and-system-identifiers-state + */ + $at += strspn( $doctype_html, " \t\n\f\r", $at, $end - $at ); + if ( $at >= $end ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false ); + } + + parse_doctype_system_identifier: + /* + * The parser should enter "DOCTYPE system identifier (double-quoted) state" or + * "DOCTYPE system identifier (single-quoted) state" by finding one of the valid quotes. + * Anything else forces quirks mode and ignores the rest of the contents. + * + * @see https://html.spec.whatwg.org/#doctype-system-identifier-(double-quoted)-state + * @see https://html.spec.whatwg.org/#doctype-system-identifier-(single-quoted)-state + */ + $closer_quote = $doctype_html[ $at ]; + + /* + * > This is a missing-quote-before-doctype-system-identifier parse error. Set the + * > current DOCTYPE token's force-quirks flag to on. Reconsume in the bogus DOCTYPE state. 
+ */ + if ( '"' !== $closer_quote && "'" !== $closer_quote ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + + ++$at; + + $identifier_length = strcspn( $doctype_html, $closer_quote, $at, $end - $at ); + $doctype_system_id = str_replace( "\0", "\u{FFFD}", substr( $doctype_html, $at, $identifier_length ) ); + + $at += $identifier_length; + if ( $at >= $end || $closer_quote !== $doctype_html[ $at ] ) { + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, true ); + } + + return new self( $doctype_name, $doctype_public_id, $doctype_system_id, false ); + } +} diff --git a/wp-includes/html-api/class-wp-html-processor.php b/wp-includes/html-api/class-wp-html-processor.php index 9f3249db74..ca7a4cf3e0 100644 --- a/wp-includes/html-api/class-wp-html-processor.php +++ b/wp-includes/html-api/class-wp-html-processor.php @@ -1076,19 +1076,16 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * > A DOCTYPE token */ case 'html': - $contents = $this->get_modifiable_text(); - if ( ' html' !== $contents ) { - /* - * @todo When the HTML Tag Processor fully parses the DOCTYPE declaration, - * this code should examine the contents to set the compatability mode. - */ - $this->bail( 'Cannot process any DOCTYPE other than a normative HTML5 doctype.' ); + $doctype = $this->get_doctype_info(); + if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) { + $this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE; } /* * > Then, switch the insertion mode to "before html". 
*/ $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; + $this->insert_html_element( $this->state->current_token ); return true; } @@ -1096,6 +1093,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * > Anything else */ initial_anything_else: + $this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE; $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML; return $this->step( self::REPROCESS_CURRENT_NODE ); } diff --git a/wp-includes/html-api/class-wp-html-tag-processor.php b/wp-includes/html-api/class-wp-html-tag-processor.php index e6e704e71c..72307cb392 100644 --- a/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/wp-includes/html-api/class-wp-html-tag-processor.php @@ -4026,6 +4026,27 @@ class WP_HTML_Tag_Processor { return true; } + /** + * Gets DOCTYPE declaration info from a DOCTYPE token. + * + * DOCTYPE tokens may appear in many places in an HTML document. In most places, they are + * simply ignored. The main parsing functions find the basic shape of DOCTYPE tokens but + * do not perform detailed parsing. + * + * This method can be called to perform a full parse of the DOCTYPE token and retrieve + * its information. + * + * @return WP_HTML_Doctype_Info|null The DOCTYPE declaration information or `null` if not + * currently at a DOCTYPE node. + */ + public function get_doctype_info(): ?WP_HTML_Doctype_Info { + if ( self::STATE_DOCTYPE !== $this->parser_state ) { + return null; + } + + return WP_HTML_Doctype_Info::from_doctype_token( substr( $this->html, $this->token_starts_at, $this->token_length ) ); + } + /** * Parser Ready State. * @@ -4117,7 +4138,7 @@ class WP_HTML_Tag_Processor { /** * Indicates that the parser has found a DOCTYPE node and it's - * possible to read and modify its modifiable text. + * possible to read its DOCTYPE information via `get_doctype_info()`. 
* * @since 6.5.0 * diff --git a/wp-includes/version.php b/wp-includes/version.php index 447ccadbff..ef7e74496f 100644 --- a/wp-includes/version.php +++ b/wp-includes/version.php @@ -16,7 +16,7 @@ * * @global string $wp_version */ -$wp_version = '6.7-alpha-58924'; +$wp_version = '6.7-alpha-58925'; /** * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema. diff --git a/wp-settings.php b/wp-settings.php index 369493f3fc..d3dfe5776e 100644 --- a/wp-settings.php +++ b/wp-settings.php @@ -252,6 +252,7 @@ require ABSPATH . WPINC . '/http.php'; require ABSPATH . WPINC . '/html-api/html5-named-character-references.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-span.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-doctype-info.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-decoder.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php';