From 97a55acee51c7d7a4237e40d55fe4f2120e1956b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 17 May 2024 14:59:15 -0700 Subject: [PATCH] KSES: Replace attribute parsing with HTML API. In this patch several kses functions have been rewritten to rely on the spec-compliant parsing made available in the HTML API. These functions convert HTML input into an array of parsed attribute values. By relying on the HTML API it's possible to recognize the HTML in the same way that a browser would, removing a layer of custom interpretation, full of discrepancies and bugs. --- src/wp-includes/kses.php | 267 +++++++++++---------------------------- 1 file changed, 73 insertions(+), 194 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index cccb1768c2dfd..42070a2fae39c 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -1323,257 +1323,136 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe * attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`). * * @since 1.0.0 + * @since 6.6.0 Based on the HTML API. * * @param string $attr Attribute list from HTML element to closing HTML element tag. * @param string[] $allowed_protocols Array of allowed URL protocols. * @return array[] Array of attribute information after parsing. */ function wp_kses_hair( $attr, $allowed_protocols ) { - $attrarr = array(); - $mode = 0; - $attrname = ''; - $uris = wp_kses_uri_attributes(); - - // Loop through the whole attribute list. - - while ( strlen( $attr ) !== 0 ) { - $working = 0; // Was the last operation successful? - - switch ( $mode ) { - case 0: - if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) { - $attrname = $match[1]; - $working = 1; - $mode = 1; - $attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr ); - } - - break; - - case 1: - if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign. 
- $working = 1; - $mode = 2; - $attr = preg_replace( '/^\s*=\s*/', '', $attr ); - break; - } - - if ( preg_match( '/^\s+/', $attr ) ) { // Valueless. - $working = 1; - $mode = 0; - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => '', - 'whole' => $attrname, - 'vless' => 'y', - ); - } + $processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" ); + if ( ! $processor->next_token() ) { + return array(); + } - $attr = preg_replace( '/^\s+/', '', $attr ); - } + $attributes = array(); + $uris = wp_kses_uri_attributes(); - break; - - case 2: - if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) { - // "value" - $thisval = $match[1]; - if ( in_array( strtolower( $attrname ), $uris, true ) ) { - $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols ); - } - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => $thisval, - 'whole' => "$attrname=\"$thisval\"", - 'vless' => 'n', - ); - } - - $working = 1; - $mode = 0; - $attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr ); - break; - } + $parsed_attributes = $processor->get_attribute_names_with_prefix( '' ); + if ( ! 
isset( $parsed_attributes ) ) { + return array(); + } - if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) { - // 'value' - $thisval = $match[1]; - if ( in_array( strtolower( $attrname ), $uris, true ) ) { - $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols ); - } - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => $thisval, - 'whole' => "$attrname='$thisval'", - 'vless' => 'n', - ); - } - - $working = 1; - $mode = 0; - $attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr ); - break; - } + foreach ( $parsed_attributes as $attribute_name ) { + $value = $processor->get_attribute( $attribute_name ); - if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) { - // value - $thisval = $match[1]; - if ( in_array( strtolower( $attrname ), $uris, true ) ) { - $thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols ); - } - - if ( false === array_key_exists( $attrname, $attrarr ) ) { - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => $thisval, - 'whole' => "$attrname=\"$thisval\"", - 'vless' => 'n', - ); - } - - // We add quotes to conform to W3C's HTML spec. - $working = 1; - $mode = 0; - $attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr ); - } + if ( true === $value ) { + $attributes[ $attribute_name ] = array( + 'name' => $attribute_name, + 'value' => '', + 'whole' => $attribute_name, + 'vless' => 'y', + ); + } else { + if ( in_array( strtolower( $attribute_name ), $uris, true ) ) { + $value = wp_kses_bad_protocol( $value, $allowed_protocols ); + } - break; - } // End switch. + $escaped_value = str_replace( '"', '&quot;', $value ); - if ( 0 === $working ) { // Not well-formed, remove and try again. - $attr = wp_kses_html_error( $attr ); - $mode = 0; + $attributes[ $attribute_name ] = array( + 'name' => $attribute_name, + 'value' => $value, + 'whole' => "{$attribute_name}=\"{$escaped_value}\"", + 'vless' => 'n', + ); } - } // End while. 
- - if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) { - /* - * Special case, for when the attribute list ends with a valueless - * attribute like "selected". - */ - $attrarr[ $attrname ] = array( - 'name' => $attrname, - 'value' => '', - 'whole' => $attrname, - 'vless' => 'y', - ); } - return $attrarr; + return $attributes; } /** * Finds all attributes of an HTML element. * - * Does not modify input. May return "evil" output. - * - * Based on `wp_kses_split2()` and `wp_kses_attr()`. + * Does not modify input. * * @since 4.2.3 + * @since 6.6.0 Based on the HTML API. * * @param string $element HTML element. * @return array|false List of attributes found in the element. Returns false on failure. */ function wp_kses_attr_parse( $element ) { - $valid = preg_match( '%^(<\s*)(/\s*)?([a-zA-Z0-9]+\s*)([^>]*)(>?)$%', $element, $matches ); - if ( 1 !== $valid ) { + $processor = new WP_HTML_Tag_Processor( $element ); + if ( ! $processor->next_token() || '#tag' !== $processor->get_token_type() ) { return false; } - $begin = $matches[1]; - $slash = $matches[2]; - $elname = $matches[3]; - $attr = $matches[4]; - $end = $matches[5]; + $tag_name = substr( $element, 1, strlen( $processor->get_tag() ) ); + $chunks = array( "<{$tag_name} " ); - if ( '' !== $slash ) { - // Closing elements do not get parsed. - return false; + $parsed_attributes = $processor->get_attribute_names_with_prefix( '' ); + if ( ! isset( $parsed_attributes ) ) { + $parsed_attributes = array(); } - // Is there a closing XHTML slash at the end of the attributes? 
- if ( 1 === preg_match( '%\s*/\s*$%', $attr, $matches ) ) { - $xhtml_slash = $matches[0]; - $attr = substr( $attr, 0, -strlen( $xhtml_slash ) ); - } else { - $xhtml_slash = ''; + foreach ( $parsed_attributes as $attribute_name ) { + $value = $processor->get_attribute( $attribute_name ); + if ( true === $value ) { + $chunks[] = $attribute_name; + } else { + $value = str_replace( '"', '&quot;', $value ); + $chunks[] = "{$attribute_name}=\"{$value}\""; + } } - // Split it. - $attrarr = wp_kses_hair_parse( $attr ); - if ( false === $attrarr ) { + $chunks[] = '>'; + + /* + * There should have been no more content available in the HTML string. + * If there had been, it would imply that the `$attr` string was + * incorrectly parsed and broke out of the tag segment. + */ + if ( false === $processor->next_token() ) { + return $chunks; + } else { return false; } - - // Make sure all input is returned by adding front and back matter. - array_unshift( $attrarr, $begin . $slash . $elname ); - array_push( $attrarr, $xhtml_slash . $end ); - - return $attrarr; } /** * Builds an attribute list from string containing attributes. * - * Does not modify input. May return "evil" output. + * Does not modify input. * In case of unexpected input, returns false instead of stripping things. * * Based on `wp_kses_hair()` but does not return a multi-dimensional array. * * @since 4.2.3 + * @since 6.6.0 Based on the HTML API. * * @param string $attr Attribute list from HTML element to closing HTML element tag. + * * @return array|false List of attributes found in $attr. Returns false on failure. */ function wp_kses_hair_parse( $attr ) { - if ( '' === $attr ) { + $chunks = wp_kses_attr_parse( "<wp {$attr}>" ); + if ( false === $chunks ) { + return false; + } + + if ( count( $chunks ) <= 2 ) { return array(); } - $regex = - '(?: - [_a-zA-Z][-_a-zA-Z0-9:.]* # Attribute name. - | - \[\[?[^\[\]]+\]\]? # Shortcode in the name position implies unfiltered_html. - ) - (?: # Attribute value. 
- \s*=\s* # All values begin with "=". - (?: - "[^"]*" # Double-quoted. - | - \'[^\']*\' # Single-quoted. - | - [^\s"\']+ # Non-quoted. - (?:\s|$) # Must have a space. - ) - | - (?:\s|$) # If attribute has no value, space is required. - ) - \s* # Trailing space is optional except as mentioned above. - '; + // Remove the tag opening. + array_shift( $chunks ); - /* - * Although it is possible to reduce this procedure to a single regexp, - * we must run that regexp twice to get exactly the expected result. - * - * Note: do NOT remove the `x` modifiers as they are essential for the above regex! - */ - - $validation = "/^($regex)+$/x"; - $extraction = "/$regex/x"; + // Remove the tag closing. + array_pop( $chunks ); - if ( 1 === preg_match( $validation, $attr ) ) { - preg_match_all( $extraction, $attr, $attrarr ); - return $attrarr[0]; - } else { - return false; - } + return $chunks; } /**