next_tag( array( 'breadcrumbs' => array( 'DIV', 'FIGURE', 'IMG' ) ) ) ) { * $processor->add_class( 'responsive-image' ); * } * * #### Breadcrumbs * * Breadcrumbs represent the stack of open elements from the root * of the document or fragment down to the currently-matched node, * if one is currently selected. Call WP_HTML_Processor::get_breadcrumbs() * to inspect the breadcrumbs for a matched tag. * * Breadcrumbs can specify nested HTML structure and are equivalent * to a CSS selector comprising tag names separated by the child * combinator, such as "DIV > FIGURE > IMG". * * Since all elements find themselves inside a full HTML document * when parsed, the return value from `get_breadcrumbs()` will always * contain any implicit outermost elements. For example, when parsing * with `create_fragment()` in the `BODY` context (the default), any * tag in the given HTML document will contain `array( 'HTML', 'BODY', … )` * in its breadcrumbs. * * Despite containing the implied outermost elements in their breadcrumbs, * tags may be found with the shortest-matching breadcrumb query. That is, * `array( 'IMG' )` matches all IMG elements and `array( 'P', 'IMG' )` * matches all IMG elements directly inside a P element. To ensure that no * partial matches erroneously match it's possible to specify in a query * the full breadcrumb match all the way down from the root HTML element. * * Example: * * $html = '
A lovely day outside
'; * // ----- Matches here. * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'IMG' ) ) ); * * $html = '
A lovely day outside
'; * // ---- Matches here. * $processor->next_tag( array( 'breadcrumbs' => array( 'FIGURE', 'FIGCAPTION', 'EM' ) ) ); * * $html = '
'; * // ----- Matches here, because IMG must be a direct child of the implicit BODY. * $processor->next_tag( array( 'breadcrumbs' => array( 'BODY', 'IMG' ) ) ); * * ## HTML Support * * This class implements a small part of the HTML5 specification. * It's designed to operate within its support and abort early whenever * encountering circumstances it can't properly handle. This is * the principle way in which this class remains as simple as possible * without cutting corners and breaking compliance. * * ### Supported elements * * If any unsupported element appears in the HTML input the HTML Processor * will abort early and stop all processing. This draconian measure ensures * that the HTML Processor won't break any HTML it doesn't fully understand. * * The HTML Processor supports all elements other than a specific set: * * - Any element inside a TABLE. * - Any element inside foreign content, including SVG and MATH. * - Any element outside the IN BODY insertion mode, e.g. doctype declarations, meta, links. * * ### Supported markup * * Some kinds of non-normative HTML involve reconstruction of formatting elements and * re-parenting of mis-nested elements. For example, a DIV tag found inside a TABLE * may in fact belong _before_ the table in the DOM. If the HTML Processor encounters * such a case it will stop processing. * * The following list illustrates some common examples of unexpected HTML inputs that * the HTML Processor properly parses and represents: * * - HTML with optional tags omitted, e.g. `

one

two`. * - HTML with unexpected tag closers, e.g. `

one more

`. * - Non-void tags with self-closing flag, e.g. `
the DIV is still open.
`. * - Heading elements which close open heading elements of another level, e.g. `

Closed by

`. * - Elements containing text that looks like other tags but isn't, e.g. `The <img> is plaintext`. * - SCRIPT and STYLE tags containing text that looks like HTML but isn't, e.g. ``. * - SCRIPT content which has been escaped, e.g. ``. * * ### Unsupported Features * * This parser does not report parse errors. * * Normally, when additional HTML or BODY tags are encountered in a document, if there * are any additional attributes on them that aren't found on the previous elements, * the existing HTML and BODY elements adopt those missing attribute values. This * parser does not add those additional attributes. * * In certain situations, elements are moved to a different part of the document in * a process called "adoption" and "fostering." Because the nodes move to a location * in the document that the parser had already processed, this parser does not support * these situations and will bail. * * @since 6.4.0 * * @see WP_HTML_Tag_Processor * @see https://html.spec.whatwg.org/ */ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * The maximum number of bookmarks allowed to exist at any given time. * * HTML processing requires more bookmarks than basic tag processing, * so this class constant from the Tag Processor is overwritten. * * @since 6.4.0 * * @var int */ const MAX_BOOKMARKS = 100; /** * Holds the working state of the parser, including the stack of * open elements and the stack of active formatting elements. * * Initialized in the constructor. * * @since 6.4.0 * * @var WP_HTML_Processor_State */ private $state; /** * Used to create unique bookmark names. * * This class sets a bookmark for every tag in the HTML document that it encounters. * The bookmark name is auto-generated and increments, starting with `1`. These are * internal bookmarks and are automatically released when the referring WP_HTML_Token * goes out of scope and is garbage-collected. * * @since 6.4.0 * * @see WP_HTML_Processor::$release_internal_bookmark_on_destruct * * @var int */ private $bookmark_counter = 0; /** * Stores an explanation for why something failed, if it did. * * @see self::get_last_error * * @since 6.4.0 * * @var string|null */ private $last_error = null; /** * Stores context for why the parser bailed on unsupported HTML, if it did. * * @see self::get_unsupported_exception * * @since 6.7.0 * * @var WP_HTML_Unsupported_Exception|null */ private $unsupported_exception = null; /** * Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance. * * This function is created inside the class constructor so that it can be passed to * the stack of open elements and the stack of active formatting elements without * exposing it as a public method on the class. * * @since 6.4.0 * * @var Closure|null */ private $release_internal_bookmark_on_destruct = null; /** * Stores stack events which arise during parsing of the * HTML document, which will then supply the "match" events. * * @since 6.6.0 * * @var WP_HTML_Stack_Event[] */ private $element_queue = array(); /** * Stores the current breadcrumbs. * * @since 6.7.0 * * @var string[] */ private $breadcrumbs = array(); /** * Current stack event, if set, representing a matched token. * * Because the parser may internally point to a place further along in a document * than the nodes which have already been processed (some "virtual" nodes may have * appeared while scanning the HTML document), this will point at the "current" node * being processed. It comes from the front of the element queue. * * @since 6.6.0 * * @var WP_HTML_Stack_Event|null */ private $current_element = null; /** * Context node if created as a fragment parser. * * @var WP_HTML_Token|null */ private $context_node = null; /* * Public Interface Functions */ /** * Creates an HTML processor in the fragment parsing mode. * * Use this for cases where you are processing chunks of HTML that * will be found within a bigger HTML document, such as rendered * block output that exists within a post, `the_content` inside a * rendered site layout. * * Fragment parsing occurs within a context, which is an HTML element * that the document will eventually be placed in. It becomes important * when special elements have different rules than others, such as inside * a TEXTAREA or a TITLE tag where things that look like tags are text, * or inside a SCRIPT tag where things that look like HTML syntax are JS. * * The context value should be a representation of the tag into which the * HTML is found. For most cases this will be the body element. The HTML * form is provided because a context element may have attributes that * impact the parse, such as with a SCRIPT tag and its `type` attribute. * * ## Current HTML Support * * - The only supported context is ``, which is the default value. * - The only supported document encoding is `UTF-8`, which is the default value. * * @since 6.4.0 * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances. * * @param string $html Input HTML fragment to process. * @param string $context Context element for the fragment, must be default of ``. * @param string $encoding Text encoding of the document; must be default of 'UTF-8'. * @return static|null The created processor if successful, otherwise null. */ public static function create_fragment( $html, $context = '', $encoding = 'UTF-8' ) { if ( '' !== $context || 'UTF-8' !== $encoding ) { return null; } $context_processor = static::create_full_parser( "{$context}", $encoding ); if ( null === $context_processor ) { return null; } while ( $context_processor->next_tag() ) { $context_processor->set_bookmark( 'final_node' ); } if ( ! $context_processor->has_bookmark( 'final_node' ) || ! $context_processor->seek( 'final_node' ) ) { _doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' ); return null; } return $context_processor->create_fragment_at_current_node( $html ); } /** * Creates an HTML processor in the full parsing mode. * * It's likely that a fragment parser is more appropriate, unless sending an * entire HTML document from start to finish. Consider a fragment parser with * a context node of ``. * * UTF-8 is the only allowed encoding. If working with a document that * isn't UTF-8, first convert the document to UTF-8, then pass in the * converted HTML. * * @param string $html Input HTML document to process. * @param string|null $known_definite_encoding Optional. If provided, specifies the charset used * in the input byte stream. Currently must be UTF-8. * @return static|null The created processor if successful, otherwise null. */ public static function create_full_parser( $html, $known_definite_encoding = 'UTF-8' ) { if ( 'UTF-8' !== $known_definite_encoding ) { return null; } $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); $processor->state->encoding = $known_definite_encoding; $processor->state->encoding_confidence = 'certain'; return $processor; } /** * Constructor. * * Do not use this method. Use the static creator methods instead. * * @access private * * @since 6.4.0 * * @see WP_HTML_Processor::create_fragment() * * @param string $html HTML to process. * @param string|null $use_the_static_create_methods_instead This constructor should not be called manually. */ public function __construct( $html, $use_the_static_create_methods_instead = null ) { parent::__construct( $html ); if ( self::CONSTRUCTOR_UNLOCK_CODE !== $use_the_static_create_methods_instead ) { _doing_it_wrong( __METHOD__, sprintf( /* translators: %s: WP_HTML_Processor::create_fragment(). */ __( 'Call %s to create an HTML Processor instead of calling the constructor directly.' ), 'WP_HTML_Processor::create_fragment()' ), '6.4.0' ); } $this->state = new WP_HTML_Processor_State(); $this->state->stack_of_open_elements->set_push_handler( function ( WP_HTML_Token $token ): void { $is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer(); $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance ); $this->change_parsing_namespace( $token->integration_node_type ? 'html' : $token->namespace ); } ); $this->state->stack_of_open_elements->set_pop_handler( function ( WP_HTML_Token $token ): void { $is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer(); $same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name; $provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real'; $this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance ); $adjusted_current_node = $this->get_adjusted_current_node(); if ( $adjusted_current_node ) { $this->change_parsing_namespace( $adjusted_current_node->integration_node_type ? 'html' : $adjusted_current_node->namespace ); } else { $this->change_parsing_namespace( 'html' ); } } ); /* * Create this wrapper so that it's possible to pass * a private method into WP_HTML_Token classes without * exposing it to any public API. */ $this->release_internal_bookmark_on_destruct = function ( string $name ): void { parent::release_bookmark( $name ); }; } /** * Creates a fragment processor at the current node. * * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`. * * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML * fragment `Inside TD?`. * * A BODY context node will produce the following tree: * * └─#text Inside TD? * * Notice that the `` tags are completely ignored. * * Compare that with an SVG context node that produces the following tree: * * ├─svg:td * └─#text Inside TD? * * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected. * This is a peculiarity of parsing HTML in foreign content like SVG. * * Finally, consider the tree produced with a TABLE context node: * * └─TBODY * └─TR * └─TD * └─#text Inside TD? * * These examples demonstrate how important the context node may be when processing an HTML * fragment. Special care must be taken when processing fragments that are expected to appear * in specific contexts. SVG and TABLE are good examples, but there are others. * * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm * * @since 6.8.0 * * @param string $html Input HTML fragment to process. * @return static|null The created processor if successful, otherwise null. */ private function create_fragment_at_current_node( string $html ) { if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) { _doing_it_wrong( __METHOD__, __( 'The context element must be a start tag.' ), '6.8.0' ); return null; } $tag_name = $this->current_element->token->node_name; $namespace = $this->current_element->token->namespace; if ( 'html' === $namespace && self::is_void( $tag_name ) ) { _doing_it_wrong( __METHOD__, sprintf( // translators: %s: A tag name like INPUT or BR. __( 'The context element cannot be a void element, found "%s".' ), $tag_name ), '6.8.0' ); return null; } /* * Prevent creating fragments at nodes that require a special tokenizer state. * This is unsupported by the HTML Processor. */ if ( 'html' === $namespace && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) ) { _doing_it_wrong( __METHOD__, sprintf( // translators: %s: A tag name like IFRAME or TEXTAREA. __( 'The context element "%s" is not supported.' ), $tag_name ), '6.8.0' ); return null; } $fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); $fragment_processor->compat_mode = $this->compat_mode; // @todo Create "fake" bookmarks for non-existent but implied nodes. $fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); $root_node = new WP_HTML_Token( 'root-node', 'HTML', false ); $fragment_processor->state->stack_of_open_elements->push( $root_node ); $fragment_processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); $fragment_processor->context_node = clone $this->current_element->token; $fragment_processor->context_node->bookmark_name = 'context-node'; $fragment_processor->context_node->on_destroy = null; $fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name ); if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) { $fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE; } $fragment_processor->reset_insertion_mode_appropriately(); /* * > Set the parser's form element pointer to the nearest node to the context element that * > is a form element (going straight up the ancestor chain, and including the element * > itself, if it is a form element), if any. (If there is no such form element, the * > form element pointer keeps its initial value, null.) */ foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) { if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) { $fragment_processor->state->form_element = clone $element; $fragment_processor->state->form_element->bookmark_name = null; $fragment_processor->state->form_element->on_destroy = null; break; } } $fragment_processor->state->encoding_confidence = 'irrelevant'; /* * Update the parsing namespace near the end of the process. * This is important so that any push/pop from the stack of open * elements does not change the parsing namespace. */ $fragment_processor->change_parsing_namespace( $this->current_element->token->integration_node_type ? 'html' : $namespace ); return $fragment_processor; } /** * Stops the parser and terminates its execution when encountering unsupported markup. * * @throws WP_HTML_Unsupported_Exception Halts execution of the parser. * * @since 6.7.0 * * @param string $message Explains support is missing in order to parse the current node. */ private function bail( string $message ) { $here = $this->bookmarks[ $this->state->current_token->bookmark_name ]; $token = substr( $this->html, $here->start, $here->length ); $open_elements = array(); foreach ( $this->state->stack_of_open_elements->stack as $item ) { $open_elements[] = $item->node_name; } $active_formats = array(); foreach ( $this->state->active_formatting_elements->walk_down() as $item ) { $active_formats[] = $item->node_name; } $this->last_error = self::ERROR_UNSUPPORTED; $this->unsupported_exception = new WP_HTML_Unsupported_Exception( $message, $this->state->current_token->node_name, $here->start, $token, $open_elements, $active_formats ); throw $this->unsupported_exception; } /** * Returns the last error, if any. * * Various situations lead to parsing failure but this class will * return `false` in all those cases. To determine why something * failed it's possible to request the last error. This can be * helpful to know to distinguish whether a given tag couldn't * be found or if content in the document caused the processor * to give up and abort processing. * * Example * * $processor = WP_HTML_Processor::create_fragment( '