Skip to content

Commit

Permalink
KSES: Replace attribute parsing with HTML API.
Browse files Browse the repository at this point in the history
In this patch several kses functions have been rewritten to rely on the
spec-compliant parsing made available in the HTML API. These functions
convert HTML input into an array of parsed attribute values. By relying
on the HTML API it's possible to recognize the HTML in the same way that
a browser would, removing a layer of custom interpretation, full of
discrepancies and bugs.
  • Loading branch information
dmsnell committed May 17, 2024
1 parent 6f7cd05 commit 97a55ac
Showing 1 changed file with 73 additions and 194 deletions.
267 changes: 73 additions & 194 deletions src/wp-includes/kses.php
Original file line number Diff line number Diff line change
Expand Up @@ -1323,257 +1323,136 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
* attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`).
*
* @since 1.0.0
* @since 6.6.0 Based on the HTML API.
*
* @param string $attr Attribute list from HTML element to closing HTML element tag.
* @param string[] $allowed_protocols Array of allowed URL protocols.
* @return array[] Array of attribute information after parsing.
*/
function wp_kses_hair( $attr, $allowed_protocols ) {
$attrarr = array();
$mode = 0;
$attrname = '';
$uris = wp_kses_uri_attributes();

// Loop through the whole attribute list.

while ( strlen( $attr ) !== 0 ) {
$working = 0; // Was the last operation successful?
switch ( $mode ) {
case 0:
if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) {
$attrname = $match[1];
$working = 1;
$mode = 1;
$attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr );
}

break;

case 1:
if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign.
$working = 1;
$mode = 2;
$attr = preg_replace( '/^\s*=\s*/', '', $attr );
break;
}

if ( preg_match( '/^\s+/', $attr ) ) { // Valueless.
$working = 1;
$mode = 0;

if ( false === array_key_exists( $attrname, $attrarr ) ) {
$attrarr[ $attrname ] = array(
'name' => $attrname,
'value' => '',
'whole' => $attrname,
'vless' => 'y',
);
}
$processor = new WP_HTML_Tag_Processor( "<fake-tag {$attr}>" );
if ( ! $processor->next_token() ) {
return array();
}

$attr = preg_replace( '/^\s+/', '', $attr );
}
$attributes = array();
$uris = wp_kses_uri_attributes();

break;

case 2:
if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) {
// "value"
$thisval = $match[1];
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
}

if ( false === array_key_exists( $attrname, $attrarr ) ) {
$attrarr[ $attrname ] = array(
'name' => $attrname,
'value' => $thisval,
'whole' => "$attrname=\"$thisval\"",
'vless' => 'n',
);
}

$working = 1;
$mode = 0;
$attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr );
break;
}
$parsed_attributes = $processor->get_attribute_names_with_prefix( '' );
if ( ! isset( $parsed_attributes ) ) {
return array();
}

if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) {
// 'value'
$thisval = $match[1];
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
}

if ( false === array_key_exists( $attrname, $attrarr ) ) {
$attrarr[ $attrname ] = array(
'name' => $attrname,
'value' => $thisval,
'whole' => "$attrname='$thisval'",
'vless' => 'n',
);
}

$working = 1;
$mode = 0;
$attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr );
break;
}
foreach ( $parsed_attributes as $attribute_name ) {
$value = $processor->get_attribute( $attribute_name );

if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) {
// value
$thisval = $match[1];
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
}

if ( false === array_key_exists( $attrname, $attrarr ) ) {
$attrarr[ $attrname ] = array(
'name' => $attrname,
'value' => $thisval,
'whole' => "$attrname=\"$thisval\"",
'vless' => 'n',
);
}

// We add quotes to conform to W3C's HTML spec.
$working = 1;
$mode = 0;
$attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr );
}
if ( true === $value ) {
$attributes[ $attribute_name ] = array(
'name' => $attribute_name,
'value' => '',
'whole' => $attribute_name,
'vless' => 'y',
);
} else {
if ( in_array( strtolower( $attribute_name ), $uris, true ) ) {
$value = wp_kses_bad_protocol( $value, $allowed_protocols );
}

break;
} // End switch.
$escaped_value = str_replace( '"', '&quot;', $value );

if ( 0 === $working ) { // Not well-formed, remove and try again.
$attr = wp_kses_html_error( $attr );
$mode = 0;
$attributes[ $attribute_name ] = array(
'name' => $attribute_name,
'value' => $value,
'whole' => "{$attribute_name}=\"{$escaped_value}\"",
'vless' => 'n',
);
}
} // End while.

if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) {
/*
* Special case, for when the attribute list ends with a valueless
* attribute like "selected".
*/
$attrarr[ $attrname ] = array(
'name' => $attrname,
'value' => '',
'whole' => $attrname,
'vless' => 'y',
);
}

return $attrarr;
return $attributes;
}

/**
* Finds all attributes of an HTML element.
*
* Does not modify input. May return "evil" output.
*
* Based on `wp_kses_split2()` and `wp_kses_attr()`.
* Does not modify input.
*
* @since 4.2.3
* @since 6.6.0 Based on the HTML API.
*
* @param string $element HTML element.
* @return array|false List of attributes found in the element. Returns false on failure.
*/
function wp_kses_attr_parse( $element ) {
$valid = preg_match( '%^(<\s*)(/\s*)?([a-zA-Z0-9]+\s*)([^>]*)(>?)$%', $element, $matches );
if ( 1 !== $valid ) {
$processor = new WP_HTML_Tag_Processor( $element );
if ( ! $processor->next_token() || '#tag' !== $processor->get_token_type() ) {
return false;
}

$begin = $matches[1];
$slash = $matches[2];
$elname = $matches[3];
$attr = $matches[4];
$end = $matches[5];
$tag_name = substr( $element, 1, strlen( $processor->get_tag() ) );
$chunks = array( "<{$tag_name} " );

if ( '' !== $slash ) {
// Closing elements do not get parsed.
return false;
$parsed_attributes = $processor->get_attribute_names_with_prefix( '' );
if ( ! isset( $parsed_attributes ) ) {
$parsed_attributes = array();
}

// Is there a closing XHTML slash at the end of the attributes?
if ( 1 === preg_match( '%\s*/\s*$%', $attr, $matches ) ) {
$xhtml_slash = $matches[0];
$attr = substr( $attr, 0, -strlen( $xhtml_slash ) );
} else {
$xhtml_slash = '';
foreach ( $parsed_attributes as $attribute_name ) {
$value = $processor->get_attribute( $attribute_name );
if ( true === $value ) {
$chunks[] = $attribute_name;
} else {
$value = str_replace( '"', '&quot;', $value );
$chunks[] = "{$attribute_name}=\"{$value}\"";
}
}

// Split it.
$attrarr = wp_kses_hair_parse( $attr );
if ( false === $attrarr ) {
$chunks[] = '>';

/*
* There should have been no more content available in the HTML string.
* If there had been, it would imply that the `$attr` string was
* incorrectly parsed and broke out of the tag segment.
*/
if ( false === $processor->next_token() ) {
return $chunks;
} else {
return false;
}

// Make sure all input is returned by adding front and back matter.
array_unshift( $attrarr, $begin . $slash . $elname );
array_push( $attrarr, $xhtml_slash . $end );

return $attrarr;
}

/**
* Builds an attribute list from string containing attributes.
*
* Does not modify input. May return "evil" output.
* Does not modify input.
* In case of unexpected input, returns false instead of stripping things.
*
* Based on `wp_kses_hair()` but does not return a multi-dimensional array.
*
* @since 4.2.3
* @since 6.6.0 Based on the HTML API.
*
* @param string $attr Attribute list from HTML element to closing HTML element tag.
*
* @return array|false List of attributes found in $attr. Returns false on failure.
*/
function wp_kses_hair_parse( $attr ) {
if ( '' === $attr ) {
$chunks = wp_kses_attr_parse( "<fake-tag {$attr}>" );
if ( false === $chunks ) {
return false;
}

if ( count( $chunks ) <= 2 ) {
return array();
}

$regex =
'(?:
[_a-zA-Z][-_a-zA-Z0-9:.]* # Attribute name.
|
\[\[?[^\[\]]+\]\]? # Shortcode in the name position implies unfiltered_html.
)
(?: # Attribute value.
\s*=\s* # All values begin with "=".
(?:
"[^"]*" # Double-quoted.
|
\'[^\']*\' # Single-quoted.
|
[^\s"\']+ # Non-quoted.
(?:\s|$) # Must have a space.
)
|
(?:\s|$) # If attribute has no value, space is required.
)
\s* # Trailing space is optional except as mentioned above.
';
// Remove the tag opening.
array_shift( $chunks );

/*
* Although it is possible to reduce this procedure to a single regexp,
* we must run that regexp twice to get exactly the expected result.
*
* Note: do NOT remove the `x` modifiers as they are essential for the above regex!
*/

$validation = "/^($regex)+$/x";
$extraction = "/$regex/x";
// Remove the tag closing.
array_pop( $chunks );

if ( 1 === preg_match( $validation, $attr ) ) {
preg_match_all( $extraction, $attr, $attrarr );
return $attrarr[0];
} else {
return false;
}
return $chunks;
}

/**
Expand Down

0 comments on commit 97a55ac

Please sign in to comment.