From 3f71149d19bab2744d9716a786914938c742c81f Mon Sep 17 00:00:00 2001
From: saxmatt
Date: Tue, 23 Dec 2003 22:09:02 +0000
Subject: [PATCH] Add kses HTML cleaning.

git-svn-id: http://svn.automattic.com/wordpress/trunk@649 1a063a9b-81f0-0310-95a4-ce76da25c4cd
---
 wp-includes/kses.php | 528 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 528 insertions(+)
 create mode 100644 wp-includes/kses.php

diff --git a/wp-includes/kses.php b/wp-includes/kses.php
new file mode 100644
index 0000000000..5b4125b0f1
--- /dev/null
+++ b/wp-includes/kses.php
@@ -0,0 +1,528 @@
+" characters.
+###############################################################################
+{
+  # NOTE(review): the "e" (eval) pattern modifier used below was removed in
+  # PHP 7.0; on newer runtimes this needs preg_replace_callback() instead.
+  return preg_replace('%(<'.   # EITHER: <
+                      '[^>]*'. # things that aren't >
+                      '(>|$)'. # > or end of string
+                      '|>)%e', # OR: just a >
+                      "wp_kses_split2('\\1', \$allowed_html, ".
+                      '$allowed_protocols)',
+                      $string);
+} # function wp_kses_split
+
+
+function wp_kses_split2($string, $allowed_html, $allowed_protocols)
+###############################################################################
+# This function does a lot of work. It rejects some very malformed things
+# like <:::>. It returns an empty string, if the element isn't allowed (look
+# ma, no strip_tags()!). Otherwise it splits the tag into an element and an
+# attribute list.
+#
+# $string is one match from wp_kses_split(): either a tag candidate starting
+# with "<" or a lone ">". Returns "&gt;" for a lone ">", an empty string for a
+# malformed or not allowed tag, and otherwise whatever wp_kses_attr() builds.
+###############################################################################
+{
+  # NOTE(review): kses_stripslashes() is not defined in the visible part of
+  # this file. If it was renamed to wp_kses_stripslashes() like the functions
+  # defined here, this call needs the same prefix -- confirm.
+  $string = kses_stripslashes($string);
+
+  if (substr($string, 0, 1) != '<')
+    return '&gt;';
+    # It matched a ">" character; hand it back as an entity instead of as a
+    # raw character
+
+  if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?$%', $string, $matches))
+    return '';
+    # It's seriously malformed
+
+  $slash = trim($matches[1]);
+  $elem = $matches[2];
+  $attrlist = $matches[3];
+
+  if (!is_array($allowed_html[strtolower($elem)]))
+    return '';
+    # They are using a not allowed HTML element
+
+  # Call the wp_-prefixed name: that is how the function is defined below.
+  return wp_kses_attr("$slash$elem", $attrlist, $allowed_html,
+                      $allowed_protocols);
+} # function wp_kses_split2
+
+
+function wp_kses_attr($element, $attr, $allowed_html, $allowed_protocols)
+###############################################################################
+# This function removes all attributes, if none are allowed for this element.
+# If some are allowed it calls wp_kses_hair() to split them further, and then
+# it builds up new HTML code from the data that wp_kses_hair() returns. It also
+# removes "<" and ">" characters, if there are any left. One more thing it
+# does is to check if the tag has a closing XHTML slash, and if it does,
+# it puts one in the returned code as well.
+#
+# $element is the element name, with a leading "/" for closing tags (as passed
+# by wp_kses_split2()); $attr is the raw attribute text. Returns the rebuilt
+# tag as a string.
+###############################################################################
+{
+# Is there a closing XHTML slash at the end of the attributes?
+
+  $xhtml_slash = '';
+  if (preg_match('%\s/\s*$%', $attr))
+    $xhtml_slash = ' /';
+
+# Are any attributes allowed at all for this element?
+
+  if (count($allowed_html[strtolower($element)]) == 0)
+    return "<$element$xhtml_slash>";
+
+# Split it
+
+  $attrarr = wp_kses_hair($attr, $allowed_protocols);
+
+# Go through $attrarr, and save the allowed attributes for this element
+# in $attr2
+
+  $attr2 = '';
+
+  foreach ($attrarr as $arreach)
+  {
+    $current = $allowed_html[strtolower($element)]
+                            [strtolower($arreach['name'])];
+    if ($current == '')
+      continue; # the attribute is not allowed
+
+    if (!is_array($current))
+      $attr2 .= ' '.$arreach['whole'];
+    # there are no checks
+
+    else
+    {
+    # there are some checks
+      $ok = true;
+      foreach ($current as $currkey => $currval)
+        if (!wp_kses_check_attr_val($arreach['value'], $arreach['vless'],
+                                    $currkey, $currval))
+        { $ok = false; break; }
+
+      if ($ok)
+        $attr2 .= ' '.$arreach['whole']; # it passed them
+    } # if !is_array($current)
+  } # foreach
+
+# Remove any "<" or ">" characters
+
+  $attr2 = preg_replace('/[<>]/', '', $attr2);
+
+  return "<$element$attr2$xhtml_slash>";
+} # function wp_kses_attr
+
+
+function wp_kses_hair($attr, $allowed_protocols)
+###############################################################################
+# This function does a lot of work. It parses an attribute list into an array
+# with attribute data, and tries to do the right thing even if it gets weird
+# input. It will add quotes around attribute values that don't have any quotes
+# or apostrophes around them, to make it easier to produce HTML code that will
+# conform to W3C's HTML specification. It will also remove bad URL protocols
+# from attribute values.
+#
+# Returns a list with one array per attribute, holding the keys 'name',
+# 'value', 'whole' (the rebuilt name=value text) and 'vless' ('y' for a
+# valueless attribute, 'n' otherwise).
+#
+# NOTE(review): kses_bad_protocol() and kses_html_error() are not defined in
+# the visible part of this file. If they were renamed with a wp_ prefix like
+# the functions defined here, the calls below need the same prefix -- confirm.
+###############################################################################
+{
+  $attrarr = array();
+  $mode = 0;
+  $attrname = '';
+
+# Loop through the whole attribute list
+
+  while (strlen($attr) != 0)
+  {
+    $working = 0; # Was the last operation successful?
+
+    switch ($mode)
+    {
+      case 0: # attribute name, href for instance
+
+        if (preg_match('/^([-a-zA-Z]+)/', $attr, $match))
+        {
+          $attrname = $match[1];
+          $working = $mode = 1;
+          $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
+        }
+
+        break;
+
+      case 1: # equals sign or valueless ("selected")
+
+        if (preg_match('/^\s*=\s*/', $attr)) # equals sign
+        {
+          $working = 1; $mode = 2;
+          $attr = preg_replace('/^\s*=\s*/', '', $attr);
+          break;
+        }
+
+        if (preg_match('/^\s+/', $attr)) # valueless
+        {
+          $working = 1; $mode = 0;
+          $attrarr[] = array
+            ('name'  => $attrname,
+             'value' => '',
+             'whole' => $attrname,
+             'vless' => 'y');
+          $attr = preg_replace('/^\s+/', '', $attr);
+        }
+
+        break;
+
+      case 2: # attribute value, a URL after href= for instance
+
+        if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match))
+         # "value"
+        {
+          $thisval = kses_bad_protocol($match[1], $allowed_protocols);
+
+          $attrarr[] = array
+            ('name'  => $attrname,
+             'value' => $thisval,
+             'whole' => "$attrname=\"$thisval\"",
+             'vless' => 'n');
+          $working = 1; $mode = 0;
+          $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
+          break;
+        }
+
+        if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match))
+         # 'value'
+        {
+          $thisval = kses_bad_protocol($match[1], $allowed_protocols);
+
+          $attrarr[] = array
+            ('name'  => $attrname,
+             'value' => $thisval,
+             'whole' => "$attrname='$thisval'",
+             'vless' => 'n');
+          $working = 1; $mode = 0;
+          $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
+          break;
+        }
+
+        if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match))
+         # value
+        {
+          $thisval = kses_bad_protocol($match[1], $allowed_protocols);
+
+          $attrarr[] = array
+            ('name'  => $attrname,
+             'value' => $thisval,
+             'whole' => "$attrname=\"$thisval\"",
+             'vless' => 'n');
+             # We add quotes to conform to W3C's HTML spec.
+          $working = 1; $mode = 0;
+          $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
+        }
+
+        break;
+    } # switch
+
+    if ($working == 0) # not well formed, remove and try again
+    {
+      $attr = kses_html_error($attr);
+      $mode = 0;
+    }
+  } # while
+
+  if ($mode == 1)
+  # special case, for when the attribute list ends with a valueless
+  # attribute like "selected"
+    $attrarr[] = array
+      ('name'  => $attrname,
+       'value' => '',
+       'whole' => $attrname,
+       'vless' => 'y');
+
+  return $attrarr;
+} # function wp_kses_hair
+
+
+function wp_kses_check_attr_val($value, $vless, $checkname, $checkvalue)
+###############################################################################
+# This function performs different checks for attribute values. The currently
+# implemented checks are "maxlen", "minlen", "maxval", "minval" and "valueless"
+# with even more checks to come soon.
+###############################################################################
+{
+  $ok = true;
+
+  switch (strtolower($checkname))
+  {
+    case 'maxlen':
+    # The maxlen check makes sure that the attribute value has a length not
+    # greater than the given value. This can be used to avoid Buffer Overflows
+    # in WWW clients and various Internet servers.
+
+      if (strlen($value) > $checkvalue)
+        $ok = false;
+      break;
+
+    case 'minlen':
+    # The minlen check makes sure that the attribute value has a length not
+    # smaller than the given value.
+
+      if (strlen($value) < $checkvalue)
+        $ok = false;
+      break;
+
+    case 'maxval':
+    # The maxval check does two things: it checks that the attribute value is
+    # an integer from 0 and up, without an excessive amount of zeroes or
+    # whitespace (to avoid Buffer Overflows). It also checks that the attribute
+    # value is not greater than the given value.
+    # This check can be used to avoid Denial of Service attacks.
+
+      if (!preg_match('/^\s{0,6}[0-9]{1,6}\s{0,6}$/', $value))
+        $ok = false;
+      if ($value > $checkvalue)
+        $ok = false;
+      break;
+
+    case 'minval':
+    # The minval check checks that the attribute value is a positive integer,
+    # and that it is not smaller than the given value.
+
+      if (!preg_match('/^\s{0,6}[0-9]{1,6}\s{0,6}$/', $value))
+        $ok = false;
+      if ($value < $checkvalue)
+        $ok = false;
+      break;
+
+    case 'valueless':
+    # The valueless check checks if the attribute has a value
+    # (like ) or not (