// Source: discourse/app/assets/javascripts/defer/html-sanitizer-bundle.js
// (2266 lines, 69 KiB, JavaScript)

// Copyright (C) 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview
* Implements RFC 3986 for parsing/formatting URIs.
*
* @author mikesamuel@gmail.com
* \@provides URI
* \@overrides window
*/
var URI = (function () {
/**
* creates a uri from the string form. The parser is relaxed, so special
* characters that aren't escaped but don't cause ambiguities will not cause
* parse failures.
*
* @return {URI|null}
*/
function parse(uriStr) {
  // Coerce to a string first so that non-string inputs are matched by their
  // string form. URI_RE_ is not a global regex, so exec() gives the same
  // result as String.prototype.match().
  var groups = URI_RE_.exec('' + uriStr);
  if (groups === null) {
    return null;
  }
  // Capture groups 1..7 are, in order: scheme, credentials, domain, port,
  // path, query and fragment. Unmatched or empty groups are stored as null.
  var parts = [];
  for (var g = 1; g <= 7; ++g) {
    parts.push(nullIfAbsent(groups[g]));
  }
  return new URI(
      parts[0], parts[1], parts[2], parts[3], parts[4], parts[5], parts[6]);
}
/**
* creates a uri from the given parts.
*
* @param scheme {string} an unencoded scheme such as "http" or null
* @param credentials {string} unencoded user credentials or null
* @param domain {string} an unencoded domain name or null
* @param port {number} a port number in [1, 65535].
* Values that are not greater than 0 (including -1 and null) indicate no port.
* @param path {string} an unencoded path
* @param query {Array.<string>|string|null} a list of unencoded cgi
* parameters where even values are keys and odds the corresponding values
* or an unencoded query.
* @param fragment {string} an unencoded fragment without the "#" or null.
* @return {URI}
*/
function create(scheme, credentials, domain, port, path, query, fragment) {
  // Encode every component up front, in the same order the constructor
  // receives them.
  var encodedScheme =
      encodeIfExists2(scheme, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
  var encodedCredentials =
      encodeIfExists2(credentials, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
  var encodedDomain = encodeIfExists(domain);
  // Only strictly positive ports are kept; anything else means "no port".
  var portText = port > 0 ? port.toString() : null;
  var encodedPath = encodeIfExists2(path, URI_DISALLOWED_IN_PATH_);
  var encodedFragment = encodeIfExists(fragment);

  // The query starts out absent and is filled in below when one was given.
  var uri = new URI(
      encodedScheme, encodedCredentials, encodedDomain, portText,
      encodedPath, null, encodedFragment);

  if (!query) {
    return uri;
  }
  if ('string' === typeof query) {
    // A string query is escaped character by character, leaving the usual
    // query punctuation and existing %-escapes untouched.
    uri.setRawQuery(query.replace(/[^?&=0-9A-Za-z_\-~.%]/g, encodeOne));
  } else {
    // Anything else is treated as a parameter list or map.
    uri.setAllParameters(query);
  }
  return uri;
}
/**
 * Percent-encodes a URI component when one is supplied.
 *
 * @param unescapedPart {string|null} the unencoded text.
 * @return {string|null} the encodeURIComponent form of the text, or null
 *     when the argument is not a primitive string.
 */
function encodeIfExists(unescapedPart) {
  return typeof unescapedPart === 'string'
      ? encodeURIComponent(unescapedPart)
      : null;
}
/**
* if unescapedPart is non null, then escapes any characters in it that aren't
* valid characters in a url and also escapes any special characters that
* appear in extra.
*
* @param unescapedPart {string}
* @param extra {RegExp} a character set of characters in [\01-\177].
* @return {string|null} null iff unescapedPart == null.
*/
function encodeIfExists2(unescapedPart, extra) {
  // Anything that is not a primitive string counts as "absent".
  if (typeof unescapedPart !== 'string') {
    return null;
  }
  // encodeURI leaves URI-reserved characters alone, so the characters
  // matched by `extra` are percent-encoded in a second pass.
  var encoded = encodeURI(unescapedPart);
  return encoded.replace(extra, encodeOne);
}
/** converts a character in [\01-\177] to its url encoded equivalent. */
function encodeOne(ch) {
  var HEX_DIGITS = '0123456789ABCDEF';
  // Only the low eight bits of the first code unit take part in the escape.
  var code = ch.charCodeAt(0);
  var highNibble = HEX_DIGITS.charAt((code >> 4) & 0xf);
  var lowNibble = HEX_DIGITS.charAt(code & 0xf);
  return '%' + highNibble + lowNibble;
}
/**
* {@updoc
* $ normPath('foo/./bar')
* # 'foo/bar'
* $ normPath('./foo')
* # 'foo'
* $ normPath('foo/.')
* # 'foo'
* $ normPath('foo//bar')
* # 'foo/bar'
* }
*/
function normPath(path) {
  // Drop each "." path segment, keeping whatever separator preceded it.
  var withoutDotSegments = path.replace(/(^|\/)\.(?:\/|$)/g, '$1');
  // Collapse runs of two or more slashes into a single slash.
  return withoutDotSegments.replace(/\/{2,}/g, '/');
}
// Matches "<segment>/.." together with the path break in front of it, where
// <segment> is any path element other than "..". The pieces are:
var PARENT_DIRECTORY_HANDLER = new RegExp([
  // a path break,
  '(/|^)',
  // followed by a non ".." path element
  // (it cannot be "." because normPath is applied before this RegExp),
  '(?:[^./][^/]*|\\.{2,}(?:[^./][^/]*)|\\.{3,}[^/]*)',
  // followed by ".." and then a path break.
  '/\\.\\.(?:/|$)'
].join(''));
var PARENT_DIRECTORY_HANDLER_RE = new RegExp(PARENT_DIRECTORY_HANDLER);
// Matches the run of leading "../" elements (and a final bare "..") that
// remains at the start of a collapsed path.
var EXTRA_PARENT_PATHS_RE = /^(?:\.\.\/)*(?:\.\.$)?/;
/**
* Normalizes its input path and collapses all . and .. sequences except for
* .. sequences that would take it above the root of the current parent
* directory.
* {@updoc
* $ collapse_dots('foo/../bar')
* # 'bar'
* $ collapse_dots('foo/./bar')
* # 'foo/bar'
* $ collapse_dots('foo/../bar/./../../baz')
* # 'baz'
* $ collapse_dots('../foo')
* # '../foo'
* $ collapse_dots('../foo').replace(EXTRA_PARENT_PATHS_RE, '')
* # 'foo'
* }
*/
function collapse_dots(path) {
  if (path === null) {
    return null;
  }
  // After normPath only "<segment>/.." pairs are left to flatten.
  var current = normPath(path);
  // Each pass removes one "<segment>/.." pair. The replacement is $1, the
  // path break captured in front of the segment, which guarantees that:
  // (1) at most one "/" is left where the pair used to be,
  // (2) a slash is kept whenever a path section precedes the pair, and
  // (3) a relative path is never turned into an absolute one.
  var next = current.replace(PARENT_DIRECTORY_HANDLER_RE, '$1');
  while (next !== current) {
    current = next;
    next = current.replace(PARENT_DIRECTORY_HANDLER_RE, '$1');
  }
  return current;
}
/**
* resolves a relative url string to a base uri.
* @return {URI}
*/
function resolve(baseUri, relativeUri) {
// Works on a clone of baseUri, so baseUri itself is never modified; the clone
// is what gets returned.
//
// there are several kinds of relative urls:
// 1. //foo - replaces everything from the domain on. foo is a domain name
// 2. foo - replaces the last part of the path, the whole query and fragment
// 3. /foo - replaces the path, the query and fragment
// 4. ?foo - replace the query and fragment
// 5. #foo - replace the fragment only
var absoluteUri = baseUri.clone();
// we satisfy these conditions by looking for the first part of relativeUri
// that is not blank and applying defaults to the rest
//
// `overridden` is the cascade flag: once it becomes true, every later
// component is also copied from relativeUri, even when it is absent there.
var overridden = relativeUri.hasScheme();
if (overridden) {
absoluteUri.setRawScheme(relativeUri.getRawScheme());
} else {
overridden = relativeUri.hasCredentials();
}
if (overridden) {
absoluteUri.setRawCredentials(relativeUri.getRawCredentials());
} else {
overridden = relativeUri.hasDomain();
}
if (overridden) {
absoluteUri.setRawDomain(relativeUri.getRawDomain());
} else {
overridden = relativeUri.hasPort();
}
var rawPath = relativeUri.getRawPath();
// collapse_dots returns null for a null path, so simplifiedPath may be null.
var simplifiedPath = collapse_dots(rawPath);
if (overridden) {
// The authority came from relativeUri, so its path is used as-is, minus any
// leading ".." elements that would climb above the root.
absoluteUri.setPort(relativeUri.getPort());
simplifiedPath = simplifiedPath
&& simplifiedPath.replace(EXTRA_PARENT_PATHS_RE, '');
} else {
// The authority is inherited from the base; the path is overridden only if
// relativeUri actually has a non-empty one.
overridden = !!rawPath;
if (overridden) {
// resolve path properly
if (simplifiedPath.charCodeAt(0) !== 0x2f /* / */) { // path is relative
// Merge with the directory part of the base path (everything up to and
// including its last "/"), then collapse dot segments again.
var absRawPath = collapse_dots(absoluteUri.getRawPath() || '')
.replace(EXTRA_PARENT_PATHS_RE, '');
var slash = absRawPath.lastIndexOf('/') + 1;
simplifiedPath = collapse_dots(
(slash ? absRawPath.substring(0, slash) : '')
+ collapse_dots(rawPath))
.replace(EXTRA_PARENT_PATHS_RE, '');
}
} else {
// relativeUri has no path: the base path is kept unless simplification
// produced something different from the (empty) raw path.
simplifiedPath = simplifiedPath
&& simplifiedPath.replace(EXTRA_PARENT_PATHS_RE, '');
if (simplifiedPath !== rawPath) {
absoluteUri.setRawPath(simplifiedPath);
}
}
}
if (overridden) {
absoluteUri.setRawPath(simplifiedPath);
} else {
overridden = relativeUri.hasQuery();
}
if (overridden) {
absoluteUri.setRawQuery(relativeUri.getRawQuery());
} else {
overridden = relativeUri.hasFragment();
}
if (overridden) {
absoluteUri.setRawFragment(relativeUri.getRawFragment());
}
return absoluteUri;
}
/**
* a mutable URI.
*
* This class contains setters and getters for the parts of the URI.
* The <tt>getXYZ</tt>/<tt>setXYZ</tt> methods return the decoded part -- so
* <code>uri.parse('/foo%20bar').getPath()</code> will return the decoded path,
* <tt>/foo bar</tt>.
*
* <p>The raw versions of fields are available too.
* <code>uri.parse('/foo%20bar').getRawPath()</code> will return the raw path,
* <tt>/foo%20bar</tt>. Use the raw setters with care, since
* <code>URI::toString</code> is not guaranteed to return a valid url if a
* raw setter was used.
*
* <p>All setters return <tt>this</tt> and so may be chained, a la
* <code>uri.parse('/foo').setFragment('part').toString()</code>.
*
* <p>You should not use this constructor directly -- please prefer the factory
* functions {@link uri.parse}, {@link uri.create}, {@link uri.resolve}
* instead.</p>
*
* <p>The parameters are all raw (assumed to be properly escaped) parts, and
* any (but not all) may be null. Undefined is not allowed.</p>
*
* @constructor
*/
function URI(
    rawScheme,
    rawCredentials, rawDomain, port,
    rawPath, rawQuery, rawFragment) {
  // Every component is stored exactly as given: already escaped, or null
  // when that part of the URI is absent.

  // Scheme and authority.
  this.scheme_ = rawScheme;
  this.credentials_ = rawCredentials;
  this.domain_ = rawDomain;
  this.port_ = port;

  // Path, query and fragment.
  this.path_ = rawPath;
  this.query_ = rawQuery;
  this.fragment_ = rawFragment;

  /**
   * Decoded [key0, value0, key1, value1, ...] view of the query. It is
   * built on demand and reset to null whenever the query changes.
   * @type {Array|null}
   */
  this.paramCache_ = null;
}
/** returns the string form of the url. */
URI.prototype.toString = function () {
  // The pieces are collected in an array and joined once at the end.
  var pieces = [];
  if (this.scheme_ !== null) {
    pieces.push(this.scheme_);
    pieces.push(':');
  }
  // Credentials and port are only emitted as part of an authority, i.e.
  // when a domain is present.
  if (this.domain_ !== null) {
    pieces.push('//');
    if (this.credentials_ !== null) {
      pieces.push(this.credentials_);
      pieces.push('@');
    }
    pieces.push(this.domain_);
    if (this.port_ !== null) {
      pieces.push(':');
      pieces.push(this.port_.toString());
    }
  }
  if (this.path_ !== null) {
    pieces.push(this.path_);
  }
  if (this.query_ !== null) {
    pieces.push('?');
    pieces.push(this.query_);
  }
  if (this.fragment_ !== null) {
    pieces.push('#');
    pieces.push(this.fragment_);
  }
  return pieces.join('');
};
/**
 * Copies the seven raw components into a new URI; the parameter cache is
 * not carried over.
 * @return {URI}
 */
URI.prototype.clone = function () {
  var source = this;
  return new URI(
      source.scheme_,
      source.credentials_,
      source.domain_,
      source.port_,
      source.path_,
      source.query_,
      source.fragment_);
};
/** Returns the decoded, lowercased scheme, or the stored falsy value. */
URI.prototype.getScheme = function () {
  if (!this.scheme_) {
    return this.scheme_;
  }
  // HTML5 spec does not require the scheme to be lowercased but
  // all common browsers except Safari lowercase the scheme.
  return decodeURIComponent(this.scheme_).toLowerCase();
};
/** Returns the scheme exactly as stored. */
URI.prototype.getRawScheme = function () {
  return this.scheme_;
};
/** Encodes and stores an unencoded scheme; a non-string clears it. */
URI.prototype.setScheme = function (newScheme) {
  var encoded = encodeIfExists2(
      newScheme, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
  this.scheme_ = encoded;
  return this;
};
/** Stores an already-encoded scheme; any falsy value clears it. */
URI.prototype.setRawScheme = function (newScheme) {
  this.scheme_ = newScheme || null;
  return this;
};
/** True unless the stored scheme is null. */
URI.prototype.hasScheme = function () {
  return this.scheme_ !== null;
};
/** Returns the decoded credentials, or the stored falsy value. */
URI.prototype.getCredentials = function () {
  if (!this.credentials_) {
    return this.credentials_;
  }
  return decodeURIComponent(this.credentials_);
};
/** Returns the credentials exactly as stored. */
URI.prototype.getRawCredentials = function () {
  return this.credentials_;
};
/** Encodes and stores unencoded credentials; a non-string clears them. */
URI.prototype.setCredentials = function (newCredentials) {
  var encoded = encodeIfExists2(
      newCredentials, URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_);
  this.credentials_ = encoded;
  return this;
};
/** Stores already-encoded credentials; any falsy value clears them. */
URI.prototype.setRawCredentials = function (newCredentials) {
  this.credentials_ = newCredentials || null;
  return this;
};
/** True unless the stored credentials are null. */
URI.prototype.hasCredentials = function () {
  return this.credentials_ !== null;
};
/** Returns the decoded domain, or the stored falsy value. */
URI.prototype.getDomain = function () {
  if (!this.domain_) {
    return this.domain_;
  }
  return decodeURIComponent(this.domain_);
};
/** Returns the domain exactly as stored. */
URI.prototype.getRawDomain = function () {
  return this.domain_;
};
/** Encodes and stores an unencoded domain; a falsy value clears it. */
URI.prototype.setDomain = function (newDomain) {
  var encoded = newDomain && encodeURIComponent(newDomain);
  return this.setRawDomain(encoded);
};
/** Stores an already-encoded domain; any falsy value clears it. */
URI.prototype.setRawDomain = function (newDomain) {
  this.domain_ = newDomain || null;
  // Maintain the invariant that paths must start with a slash when the URI
  // is not path-relative: re-apply the current path through setRawPath.
  return this.setRawPath(this.path_);
};
/** True unless the stored domain is null. */
URI.prototype.hasDomain = function () {
  return this.domain_ !== null;
};
/** Returns the decoded port text, or the stored falsy value. */
URI.prototype.getPort = function () {
  if (!this.port_) {
    return this.port_;
  }
  return decodeURIComponent(this.port_);
};
/**
 * Stores the port as a decimal string. Any falsy value clears the port.
 * @throws {Error} if the numeric value does not fit in 16 bits.
 */
URI.prototype.setPort = function (newPort) {
  if (!newPort) {
    this.port_ = null;
    return this;
  }
  var portNumber = Number(newPort);
  // Masking to 16 bits changes anything that is not an integer in
  // [0, 65535], including NaN, so the comparison rejects all of those.
  if (portNumber !== (portNumber & 0xffff)) {
    throw new Error('Bad port number ' + portNumber);
  }
  this.port_ = '' + portNumber;
  return this;
};
/** True unless the stored port is null. */
URI.prototype.hasPort = function () {
  return this.port_ !== null;
};
/** Returns the decoded path, or the stored falsy value. */
URI.prototype.getPath = function () {
  return this.path_ && decodeURIComponent(this.path_);
};
/** Returns the path exactly as stored. */
URI.prototype.getRawPath = function () {
  return this.path_;
};
/** Encodes and stores an unencoded path; a non-string clears it. */
URI.prototype.setPath = function (newPath) {
  return this.setRawPath(encodeIfExists2(newPath, URI_DISALLOWED_IN_PATH_));
};
/**
 * Stores an already-encoded path; any falsy value clears it. When the URI
 * has a domain, a leading '/' is added if the path lacks one.
 */
URI.prototype.setRawPath = function (newPath) {
  // A stray revision-timestamp line that had been pasted into this function
  // (and was not valid JavaScript) has been removed.
  if (newPath) {
    newPath = String(newPath);
    this.path_ =
      // Paths must start with '/' unless this is a path-relative URL.
      (!this.domain_ || /^\//.test(newPath)) ? newPath : '/' + newPath;
  } else {
    this.path_ = null;
  }
  return this;
};
/** True unless the stored path is null. */
URI.prototype.hasPath = function () {
  return null !== this.path_;
};
/** Returns the decoded query, or the stored falsy value. */
URI.prototype.getQuery = function () {
  // From http://www.w3.org/Addressing/URL/4_URI_Recommentations.html
  // Within the query string, the plus sign is reserved as shorthand notation
  // for a space.
  //
  // The '+' -> ' ' mapping has to happen BEFORE percent-decoding. Doing it
  // afterwards would also turn an escaped plus sign (%2B) into a space, so
  // setQuery('a+b').getQuery() would come back as 'a b'.
  return this.query_ && decodeURIComponent(this.query_.replace(/\+/g, ' '));
};
/** Returns the query exactly as stored (without the leading '?'). */
URI.prototype.getRawQuery = function () {
  return this.query_;
};
/** Encodes and stores an unencoded query; a non-string clears it. */
URI.prototype.setQuery = function (newQuery) {
  // The decoded parameter cache is stale once the query changes.
  this.paramCache_ = null;
  this.query_ = encodeIfExists(newQuery);
  return this;
};
/** Stores an already-encoded query; any falsy value clears it. */
URI.prototype.setRawQuery = function (newQuery) {
  this.paramCache_ = null;
  this.query_ = newQuery ? newQuery : null;
  return this;
};
/** True unless the stored query is null. */
URI.prototype.hasQuery = function () {
  return null !== this.query_;
};
/**
* sets the query given a list of strings of the form
* [ key0, value0, key1, value1, ... ].
*
* <p><code>uri.setAllParameters(['a', 'b', 'c', 'd']).getQuery()</code>
* will yield <code>'a=b&c=d'</code>.
*/
URI.prototype.setAllParameters = function (params) {
  if (typeof params === 'object') {
    var isKeyValueMap = !(params instanceof Array)
        && (params instanceof Object
            || Object.prototype.toString.call(params) !== '[object Array]');
    if (isKeyValueMap) {
      // Flatten a { key: value } map into [key0, value0, key1, value1, ...].
      // Only entries whose value is a string are kept.
      var flattened = [];
      for (var name in params) {
        var candidate = params[name];
        if ('string' === typeof candidate) {
          flattened.push(name, candidate);
        }
      }
      params = flattened;
    }
  }
  // The decoded parameter cache is stale once the query is rebuilt.
  this.paramCache_ = null;
  var pieces = [];
  var separator = '';
  for (var j = 0; j < params.length; j += 2) {
    var key = params[j];
    var value = params[j + 1];
    pieces.push(separator, encodeURIComponent(key.toString()));
    separator = '&';
    // A falsy value produces a bare key with no '=' part.
    if (value) {
      pieces.push('=', encodeURIComponent(value.toString()));
    }
  }
  this.query_ = pieces.join('');
  return this;
};
/**
 * Makes sure this.paramCache_ holds the decoded query parameters as a flat
 * [key0, value0, key1, value1, ...] list. Does nothing when the cache is
 * already populated.
 */
URI.prototype.checkParameterCache_ = function () {
  if (this.paramCache_) {
    return;
  }
  // From http://www.w3.org/Addressing/URL/4_URI_Recommentations.html
  // Within the query string, the plus sign is reserved as shorthand
  // notation for a space.
  //
  // The '+' -> ' ' mapping is applied BEFORE percent-decoding so that an
  // escaped plus sign (%2B, which is what setAllParameters emits for a
  // literal '+') survives the round trip instead of becoming a space.
  function decodePart(rawPart) {
    return decodeURIComponent(rawPart.replace(/\+/g, ' '));
  }
  var rawQuery = this.query_;
  var decoded = [];
  if (rawQuery) {
    var pairs = rawQuery.split(/[&\?]/);
    for (var i = 0; i < pairs.length; ++i) {
      var pair = pairs[i];
      // Split on the first '='. indexOf is used rather than a regex so that
      // a pair containing a line terminator cannot produce a null match.
      var eq = pair.indexOf('=');
      var rawKey = eq < 0 ? pair : pair.substring(0, eq);
      var rawValue = eq < 0 ? '' : pair.substring(eq + 1);
      decoded.push(decodePart(rawKey), decodePart(rawValue));
    }
  }
  this.paramCache_ = decoded;
};
/**
* sets the values of the named cgi parameters.
*
* <p>So, <code>uri.parse('foo?a=b&c=d&e=f').setParameterValues('c', ['new'])
* </code> yields <tt>foo?a=b&c=new&e=f</tt>.</p>
*
* @param key {string}
* @param values {Array.<string>} the new values. If values is a single string
* then it will be treated as the sole value.
*/
URI.prototype.setParameterValues = function (key, values) {
  // be nice and avoid subtle bugs where [] operator on string performs charAt
  // on some browsers and crashes on IE
  if (typeof values === 'string') {
    values = [ values ];
  }
  this.checkParameterCache_();
  var cache = this.paramCache_;
  var rebuilt = [];
  var nextValue = 0;
  for (var i = 0; i < cache.length; i += 2) {
    var existingKey = cache[i];
    if (existingKey !== key) {
      // Unrelated parameters are carried over unchanged.
      rebuilt.push(existingKey, cache[i + 1]);
      continue;
    }
    // Each existing occurrence of the key takes the next new value in
    // place; occurrences beyond the supplied values are dropped.
    if (nextValue < values.length) {
      rebuilt.push(key, values[nextValue]);
      nextValue += 1;
    }
  }
  // New values that found no existing slot are appended at the end.
  for (; nextValue < values.length; ++nextValue) {
    rebuilt.push(key, values[nextValue]);
  }
  this.setAllParameters(rebuilt);
  return this;
};
/**
 * Removes every occurrence of the named cgi parameter by replacing its
 * values with an empty list.
 */
URI.prototype.removeParameter = function (key) {
  var noValues = [];
  return this.setParameterValues(key, noValues);
};
/**
* returns the parameters specified in the query part of the uri as a list of
* keys and values like [ key0, value0, key1, value1, ... ].
*
* @return {Array.<string>}
*/
URI.prototype.getAllParameters = function () {
  this.checkParameterCache_();
  // Return a copy so that callers cannot modify the cache itself.
  var cache = this.paramCache_;
  return cache.slice();
};
/**
* returns the value<b>s</b> for a given cgi parameter as a list of decoded
* query parameter values.
* @return {Array.<string>}
*/
URI.prototype.getParameterValues = function (paramNameUnescaped) {
  this.checkParameterCache_();
  var cache = this.paramCache_;
  var matches = [];
  // Keys sit at even indices; each value directly follows its key.
  for (var i = 0; i < cache.length; i += 2) {
    if (cache[i] === paramNameUnescaped) {
      matches.push(cache[i + 1]);
    }
  }
  return matches;
};
/**
* returns a map of cgi parameter names to (non-empty) lists of values.
* @return {Object.<string,Array.<string>>}
*/
URI.prototype.getParameterMap = function (paramNameUnescaped) {
  // paramNameUnescaped is unused; it is kept so the signature is unchanged.
  this.checkParameterCache_();
  var cache = this.paramCache_;
  var paramMap = {};
  // Keys sit at even indices and values at the following odd index. The
  // index advances exactly once per pair: previously it was bumped both in
  // the loop body and in the loop header, which skipped every other pair.
  for (var i = 0; i < cache.length; i += 2) {
    var key = cache[i];
    var value = cache[i + 1];
    // Own-property check: the `in` operator would also see inherited names
    // such as "toString" and then try to push() onto a function.
    if (Object.prototype.hasOwnProperty.call(paramMap, key)) {
      paramMap[key].push(value);
    } else {
      paramMap[key] = [value];
    }
  }
  return paramMap;
};
/**
* returns the first value for a given cgi parameter or null if the given
* parameter name does not appear in the query string.
* If the given parameter name does appear, but has no '<tt>=</tt>' following
* it, then the empty string will be returned.
* @return {string|null}
*/
URI.prototype.getParameterValue = function (paramNameUnescaped) {
  this.checkParameterCache_();
  var cache = this.paramCache_;
  var index = 0;
  // Walk the keys (even indices) and stop at the first match.
  while (index < cache.length) {
    if (cache[index] === paramNameUnescaped) {
      return cache[index + 1];
    }
    index += 2;
  }
  return null;
};
/** Returns the decoded fragment, or the stored falsy value. */
URI.prototype.getFragment = function () {
  if (!this.fragment_) {
    return this.fragment_;
  }
  return decodeURIComponent(this.fragment_);
};
/** Returns the fragment exactly as stored (without the leading '#'). */
URI.prototype.getRawFragment = function () {
  return this.fragment_;
};
/** Encodes and stores an unencoded fragment; a falsy value clears it. */
URI.prototype.setFragment = function (newFragment) {
  if (newFragment) {
    this.fragment_ = encodeURIComponent(newFragment);
  } else {
    this.fragment_ = null;
  }
  return this;
};
/** Stores an already-encoded fragment; any falsy value clears it. */
URI.prototype.setRawFragment = function (newFragment) {
  this.fragment_ = newFragment || null;
  return this;
};
/** True unless the stored fragment is null. */
URI.prototype.hasFragment = function () {
  return this.fragment_ !== null;
};
/**
 * Normalizes a regex capture group: unmatched groups and empty strings both
 * become null.
 *
 * @param matchPart {string|undefined} a capture group from a regex match.
 * @return {string|null} the group text, or null if it is not a non-empty
 *     string.
 */
function nullIfAbsent(matchPart) {
  if (typeof matchPart !== 'string' || matchPart.length === 0) {
    return null;
  }
  return matchPart;
}
/**
* a regular expression for breaking a URI into its component parts.
*
* <p>http://www.gbiv.com/protocols/uri/rfc/rfc3986.html#RFC2234 says
* As the "first-match-wins" algorithm is identical to the "greedy"
* disambiguation method used by POSIX regular expressions, it is natural and
* commonplace to use a regular expression for parsing the potential five
* components of a URI reference.
*
* <p>The following line is the regular expression for breaking-down a
* well-formed URI reference into its components.
*
* <pre>
* ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
*  12            3  4          5       6  7        8 9
* </pre>
*
* <p>The numbers in the second line above are only to assist readability; they
* indicate the reference points for each subexpression (i.e., each paired
* parenthesis). We refer to the value matched for subexpression <n> as $<n>.
* For example, matching the above expression to
* <pre>
* http://www.ics.uci.edu/pub/ietf/uri/#Related
* </pre>
* results in the following subexpression matches:
* <pre>
* $1 = http:
* $2 = http
* $3 = //www.ics.uci.edu
* $4 = www.ics.uci.edu
* $5 = /pub/ietf/uri/
* $6 = <undefined>
* $7 = <undefined>
* $8 = #Related
* $9 = Related
* </pre>
* where <undefined> indicates that the component is not present, as is the
* case for the query component in the above example. Therefore, we can
* determine the value of the five components as
* <pre>
* scheme = $2
* authority = $4
* path = $5
* query = $7
* fragment = $9
* </pre>
*
* <p>msamuel: I have modified the regular expression slightly to expose the
* credentials, domain, and port separately from the authority.
* The modified version yields
* <pre>
* $1 = http scheme
* $2 = <undefined> credentials -\
* $3 = www.ics.uci.edu domain | authority
* $4 = <undefined> port -/
* $5 = /pub/ietf/uri/ path
* $6 = <undefined> query without ?
* $7 = Related fragment without #
* </pre>
*/
// The pattern is assembled from one piece per URI component. Capture groups
// 1..7 are scheme, credentials, domain, port, path, query and fragment.
var URI_RE_ = new RegExp([
  "^",
  "(?:",
  "([^:/?#]+)",         // scheme
  ":)?",
  "(?://",
  "(?:([^/?#]*)@)?",    // credentials
  "([^/?#:@]*)",        // domain
  "(?::([0-9]+))?",     // port
  ")?",
  "([^?#]+)?",          // path
  "(?:\\?([^#]*))?",    // query
  "(?:#(.*))?",         // fragment
  "$"
].join(""));
// Characters that get an extra percent-encoding pass (via encodeOne) after
// encodeURI when a scheme or credentials are encoded: '#', '/', '?' and '@'.
var URI_DISALLOWED_IN_SCHEME_OR_CREDENTIALS_ = /[#\/\?@]/g;
// Characters that get the same extra pass when a path is encoded: '#', '?'.
var URI_DISALLOWED_IN_PATH_ = /[\#\?]/g;
// Expose the factory functions as static members of the URI constructor.
URI.parse = parse;
URI.create = create;
URI.resolve = resolve;
URI.collapse_dots = collapse_dots; // Visible for testing.
// lightweight string-based api for loadModuleMaker
URI.utils = {
  /**
   * Guesses a MIME type from a URI string: 'text/html' when the decoded
   * path ends in ".html", 'application/javascript' otherwise.
   */
  mimeTypeOf: function (uri) {
    var path = parse(uri).getPath();
    return /\.html$/.test(path) ? 'text/html' : 'application/javascript';
  },
  /**
   * Resolves the string `uri` against the string `base` and returns the
   * result as a string. With a falsy base, `uri` is returned as a string
   * without being parsed.
   */
  resolve: function (base, uri) {
    if (!base) {
      return '' + uri;
    }
    return resolve(parse(base), parse(uri)).toString();
  }
};
return URI;
})();
// Exports for closure compiler.
// When a global `window` exists, URI is published on it under a quoted key.
// Where `window` is undefined the guard makes this a no-op.
if (typeof window !== 'undefined') {
window['URI'] = URI;
}
;
// Copyright Google Inc.
// Licensed under the Apache Licence Version 2.0
// Autogenerated at Mon Oct 21 13:30:08 EDT 2013
// @overrides window
// @provides html4
// Namespace object for the HTML schema tables that follow.
var html4 = {};
// Attribute value types. The numeric codes here are the values used in the
// html4.ATTRIBS table.
html4.atype = {
'NONE': 0,
'URI': 1,
'URI_FRAGMENT': 11,
'SCRIPT': 2,
'STYLE': 3,
'HTML': 12,
'ID': 4,
'IDREF': 5,
'IDREFS': 6,
'GLOBAL_NAME': 7,
'LOCAL_NAME': 8,
'CLASSES': 9,
'FRAME_TARGET': 10,
'MEDIA_QUERY': 13
};
// Re-assigned under a quoted key; presumably so that Closure Compiler keeps
// the property name intact -- TODO confirm.
html4[ 'atype' ] = html4.atype;
// Attribute table. Keys have the form 'element::attribute', with '*' as the
// element for entries that are not tied to one element name. Values are
// html4.atype codes (0 NONE, 1 URI, 2 SCRIPT, 3 STYLE, 4 ID, 5 IDREF,
// 6 IDREFS, 7 GLOBAL_NAME, 9 CLASSES, 10 FRAME_TARGET).
//
// A stray revision-timestamp line that had been pasted in front of the
// 'iframe::src' entry (and was not valid JavaScript) has been removed, as has
// the trailing comma after the last entry, to match the other tables.
html4.ATTRIBS = {
  '*::class': 9,
  '*::dir': 0,
  '*::draggable': 0,
  '*::hidden': 0,
  '*::id': 4,
  '*::inert': 0,
  '*::itemprop': 0,
  '*::itemref': 6,
  '*::itemscope': 0,
  '*::lang': 0,
  '*::onblur': 2,
  '*::onchange': 2,
  '*::onclick': 2,
  '*::ondblclick': 2,
  '*::onerror': 2,
  '*::onfocus': 2,
  '*::onkeydown': 2,
  '*::onkeypress': 2,
  '*::onkeyup': 2,
  '*::onload': 2,
  '*::onmousedown': 2,
  '*::onmousemove': 2,
  '*::onmouseout': 2,
  '*::onmouseover': 2,
  '*::onmouseup': 2,
  '*::onreset': 2,
  '*::onscroll': 2,
  '*::onselect': 2,
  '*::onsubmit': 2,
  '*::onunload': 2,
  '*::spellcheck': 0,
  '*::style': 3,
  '*::title': 0,
  '*::translate': 0,
  'a::accesskey': 0,
  'a::coords': 0,
  'a::href': 1,
  'a::hreflang': 0,
  'a::name': 7,
  'a::onblur': 2,
  'a::onfocus': 2,
  'a::shape': 0,
  'a::tabindex': 0,
  'a::target': 10,
  'a::type': 0,
  'bdo::dir': 0,
  'blockquote::cite': 1,
  'br::clear': 0,
  'caption::align': 0,
  'col::align': 0,
  'col::char': 0,
  'col::charoff': 0,
  'col::span': 0,
  'col::valign': 0,
  'col::width': 0,
  'colgroup::align': 0,
  'colgroup::char': 0,
  'colgroup::charoff': 0,
  'colgroup::span': 0,
  'colgroup::valign': 0,
  'colgroup::width': 0,
  'data::value': 0,
  'del::cite': 1,
  'del::datetime': 0,
  'details::open': 0,
  'dir::compact': 0,
  'div::align': 0,
  'dl::compact': 0,
  'font::color': 0,
  'font::face': 0,
  'font::size': 0,
  'h1::align': 0,
  'h2::align': 0,
  'h3::align': 0,
  'h4::align': 0,
  'h5::align': 0,
  'h6::align': 0,
  'hr::align': 0,
  'hr::noshade': 0,
  'hr::size': 0,
  'hr::width': 0,
  'iframe::align': 0,
  'iframe::frameborder': 0,
  'iframe::height': 0,
  'iframe::marginheight': 0,
  'iframe::marginwidth': 0,
  'iframe::width': 0,
  'iframe::src': 1,
  'img::align': 0,
  'img::alt': 0,
  'img::border': 0,
  'img::height': 0,
  'img::hspace': 0,
  'img::ismap': 0,
  'img::name': 7,
  'img::src': 1,
  'img::vspace': 0,
  'img::width': 0,
  'ins::cite': 1,
  'ins::datetime': 0,
  'label::accesskey': 0,
  'label::for': 5,
  'label::onblur': 2,
  'label::onfocus': 2,
  'legend::accesskey': 0,
  'legend::align': 0,
  'li::type': 0,
  'li::value': 0,
  'meter::high': 0,
  'meter::low': 0,
  'meter::max': 0,
  'meter::min': 0,
  'meter::value': 0,
  'ol::compact': 0,
  'ol::reversed': 0,
  'ol::start': 0,
  'ol::type': 0,
  'p::align': 0,
  'pre::width': 0,
  'progress::max': 0,
  'progress::min': 0,
  'progress::value': 0,
  'q::cite': 1,
  'source::type': 0,
  'track::default': 0,
  'track::kind': 0,
  'track::label': 0,
  'track::srclang': 0,
  'ul::compact': 0,
  'ul::type': 0
};
html4[ 'ATTRIBS' ] = html4.ATTRIBS;
// Element flag bits. Each value is a distinct power of two, so the entries in
// html4.ELEMENTS can hold several flags OR-ed together in one number.
html4.eflags = {
'OPTIONAL_ENDTAG': 1,
'EMPTY': 2,
'CDATA': 4,
'RCDATA': 8,
'UNSAFE': 16,
'FOLDABLE': 32,
'SCRIPT': 64,
'STYLE': 128,
'VIRTUALIZED': 256
};
// Re-assigned under a quoted key; presumably so that Closure Compiler keeps
// the property name intact -- TODO confirm.
html4[ 'eflags' ] = html4.eflags;
// Element table: element name -> bitwise OR of html4.eflags bits.
// Composite values that appear below:
//    84 = SCRIPT | UNSAFE | CDATA
//   148 = STYLE | UNSAFE | CDATA
//   272 = VIRTUALIZED | UNSAFE
//   273 = VIRTUALIZED | UNSAFE | OPTIONAL_ENDTAG
//   274 = VIRTUALIZED | UNSAFE | EMPTY
//   276 = VIRTUALIZED | UNSAFE | CDATA
//   280 = VIRTUALIZED | UNSAFE | RCDATA
//   305 = VIRTUALIZED | FOLDABLE | UNSAFE | OPTIONAL_ENDTAG
//
// A stray revision-timestamp line that had been pasted in front of the
// 'article' entry (and was not valid JavaScript) has been removed.
html4.ELEMENTS = {
  'a': 0,
  'abbr': 0,
  'acronym': 0,
  'address': 0,
  'article': 0,
  'aside': 0,
  'b': 0,
  'base': 274,
  'bdi': 0,
  'bdo': 0,
  'big': 0,
  'blockquote': 0,
  'body': 305,
  'br': 2,
  'caption': 0,
  'cite': 0,
  'code': 0,
  'col': 2,
  'colgroup': 1,
  'data': 0,
  'dd': 1,
  'del': 0,
  'details': 0,
  'dfn': 0,
  'dialog': 272,
  'dir': 0,
  'div': 0,
  'dl': 0,
  'dt': 1,
  'em': 0,
  'figcaption': 0,
  'figure': 0,
  'font': 0,
  'frame': 274,
  'frameset': 272,
  'h1': 0,
  'h2': 0,
  'h3': 0,
  'h4': 0,
  'h5': 0,
  'h6': 0,
  'head': 305,
  'header': 0,
  'hgroup': 0,
  'hr': 2,
  'html': 305,
  'i': 0,
  'iframe': 4,
  'img': 2,
  'ins': 0,
  'isindex': 274,
  'kbd': 0,
  'keygen': 274,
  'label': 0,
  'legend': 0,
  'li': 1,
  'link': 274,
  'mark': 0,
  'meter': 0,
  'nav': 0,
  'nobr': 0,
  'noembed': 276,
  'noframes': 276,
  'noscript': 276,
  'object': 272,
  'ol': 0,
  'p': 1,
  'param': 274,
  'pre': 0,
  'progress': 0,
  'q': 0,
  's': 0,
  'samp': 0,
  'script': 84,
  'section': 0,
  'small': 0,
  'span': 0,
  'strike': 0,
  'strong': 0,
  'style': 148,
  'sub': 0,
  'summary': 0,
  'sup': 0,
  'table': 272,
  'tbody': 273,
  'td': 273,
  'tfoot': 1,
  'th': 273,
  'thead': 273,
  'time': 0,
  'title': 280,
  'tr': 273,
  'track': 2,
  'tt': 0,
  'u': 0,
  'ul': 0,
  'var': 0,
  'wbr': 2
};
html4[ 'ELEMENTS' ] = html4.ELEMENTS;
// Maps each lowercase element name to the name (as a string) of a DOM
// interface. This table lists more elements than html4.ELEMENTS does
// (for example 'applet', 'area', 'audio', 'form', 'input' and 'video').
html4.ELEMENT_DOM_INTERFACES = {
'a': 'HTMLAnchorElement',
'abbr': 'HTMLElement',
'acronym': 'HTMLElement',
'address': 'HTMLElement',
'applet': 'HTMLAppletElement',
'area': 'HTMLAreaElement',
'article': 'HTMLElement',
'aside': 'HTMLElement',
'audio': 'HTMLAudioElement',
'b': 'HTMLElement',
'base': 'HTMLBaseElement',
'basefont': 'HTMLBaseFontElement',
'bdi': 'HTMLElement',
'bdo': 'HTMLElement',
'big': 'HTMLElement',
'blockquote': 'HTMLQuoteElement',
'body': 'HTMLBodyElement',
'br': 'HTMLBRElement',
'caption': 'HTMLTableCaptionElement',
'cite': 'HTMLElement',
'code': 'HTMLElement',
'col': 'HTMLTableColElement',
'colgroup': 'HTMLTableColElement',
'command': 'HTMLCommandElement',
'data': 'HTMLElement',
'datalist': 'HTMLDataListElement',
'dd': 'HTMLElement',
'del': 'HTMLModElement',
'details': 'HTMLDetailsElement',
'dfn': 'HTMLElement',
'dialog': 'HTMLDialogElement',
'dir': 'HTMLDirectoryElement',
'div': 'HTMLDivElement',
'dl': 'HTMLDListElement',
'dt': 'HTMLElement',
'em': 'HTMLElement',
'fieldset': 'HTMLFieldSetElement',
'figcaption': 'HTMLElement',
'figure': 'HTMLElement',
'font': 'HTMLFontElement',
'footer': 'HTMLElement',
'form': 'HTMLFormElement',
'frame': 'HTMLFrameElement',
'frameset': 'HTMLFrameSetElement',
'h1': 'HTMLHeadingElement',
'h2': 'HTMLHeadingElement',
'h3': 'HTMLHeadingElement',
'h4': 'HTMLHeadingElement',
'h5': 'HTMLHeadingElement',
'h6': 'HTMLHeadingElement',
'head': 'HTMLHeadElement',
'header': 'HTMLElement',
'hgroup': 'HTMLElement',
'hr': 'HTMLHRElement',
'html': 'HTMLHtmlElement',
'i': 'HTMLElement',
'iframe': 'HTMLIFrameElement',
'img': 'HTMLImageElement',
'input': 'HTMLInputElement',
'ins': 'HTMLModElement',
'isindex': 'HTMLUnknownElement',
'kbd': 'HTMLElement',
'keygen': 'HTMLKeygenElement',
'label': 'HTMLLabelElement',
'legend': 'HTMLLegendElement',
'li': 'HTMLLIElement',
'link': 'HTMLLinkElement',
'map': 'HTMLMapElement',
'mark': 'HTMLElement',
'menu': 'HTMLMenuElement',
'meta': 'HTMLMetaElement',
'meter': 'HTMLMeterElement',
'nav': 'HTMLElement',
'nobr': 'HTMLElement',
'noembed': 'HTMLElement',
'noframes': 'HTMLElement',
'noscript': 'HTMLElement',
'object': 'HTMLObjectElement',
'ol': 'HTMLOListElement',
'optgroup': 'HTMLOptGroupElement',
'option': 'HTMLOptionElement',
'output': 'HTMLOutputElement',
'p': 'HTMLParagraphElement',
'param': 'HTMLParamElement',
'pre': 'HTMLPreElement',
'progress': 'HTMLProgressElement',
'q': 'HTMLQuoteElement',
's': 'HTMLElement',
'samp': 'HTMLElement',
'script': 'HTMLScriptElement',
'section': 'HTMLElement',
'select': 'HTMLSelectElement',
'small': 'HTMLElement',
'source': 'HTMLSourceElement',
'span': 'HTMLSpanElement',
'strike': 'HTMLElement',
'strong': 'HTMLElement',
'style': 'HTMLStyleElement',
'sub': 'HTMLElement',
'summary': 'HTMLElement',
'sup': 'HTMLElement',
'table': 'HTMLTableElement',
'tbody': 'HTMLTableSectionElement',
'td': 'HTMLTableDataCellElement',
'tfoot': 'HTMLTableSectionElement',
'th': 'HTMLTableHeaderCellElement',
'thead': 'HTMLTableSectionElement',
'time': 'HTMLTimeElement',
'title': 'HTMLTitleElement',
'tr': 'HTMLTableRowElement',
'track': 'HTMLTrackElement',
'tt': 'HTMLElement',
'u': 'HTMLElement',
'ul': 'HTMLUListElement',
'var': 'HTMLElement',
'video': 'HTMLVideoElement',
'wbr': 'HTMLElement'
};
// Re-assigned under a quoted key; presumably so that Closure Compiler keeps
// the property name intact -- TODO confirm.
html4[ 'ELEMENT_DOM_INTERFACES' ] = html4.ELEMENT_DOM_INTERFACES;
// URI effect codes. These are the values used in html4.URIEFFECTS.
html4.ueffects = {
'NOT_LOADED': 0,
'SAME_DOCUMENT': 1,
'NEW_DOCUMENT': 2
};
// Re-assigned under a quoted key; presumably so that Closure Compiler keeps
// the property name intact -- TODO confirm.
html4[ 'ueffects' ] = html4.ueffects;
// Maps 'element::attribute' keys for URI-valued attributes to an
// html4.ueffects code (0 NOT_LOADED, 1 SAME_DOCUMENT, 2 NEW_DOCUMENT).
//
// A stray revision-timestamp line that had been pasted in front of the
// 'iframe::src' entry (and was not valid JavaScript) has been removed.
html4.URIEFFECTS = {
  'a::href': 2,
  'area::href': 2,
  'audio::src': 1,
  'blockquote::cite': 0,
  'command::icon': 1,
  'del::cite': 0,
  'form::action': 2,
  'iframe::src': 1,
  'img::src': 1,
  'input::src': 1,
  'ins::cite': 0,
  'q::cite': 0,
  'video::poster': 1,
  'video::src': 1
};
html4[ 'URIEFFECTS' ] = html4.URIEFFECTS;
// Loader type codes. These are the values used in html4.LOADERTYPES.
html4.ltypes = {
'UNSANDBOXED': 2,
'SANDBOXED': 1,
'DATA': 0
};
// Re-assigned under a quoted key; presumably so that Closure Compiler keeps
// the property name intact -- TODO confirm.
html4[ 'ltypes' ] = html4.ltypes;
// Maps 'element::attribute' keys for URI-valued attributes to an
// html4.ltypes code (0 DATA, 1 SANDBOXED, 2 UNSANDBOXED).
//
// A stray revision-timestamp line that had been pasted in front of the
// 'iframe::src' entry (and was not valid JavaScript) has been removed.
html4.LOADERTYPES = {
  'a::href': 2,
  'area::href': 2,
  'audio::src': 2,
  'blockquote::cite': 2,
  'command::icon': 1,
  'del::cite': 2,
  'form::action': 2,
  'iframe::src': 2,
  'img::src': 1,
  'input::src': 1,
  'ins::cite': 2,
  'q::cite': 2,
  'video::poster': 1,
  'video::src': 2
};
html4[ 'LOADERTYPES' ] = html4.LOADERTYPES;
// Maps an element name to the list of attribute names required on it.
// NOTE: currently focused only on URI-type attributes
//
// A stray revision-timestamp line that preceded this table (and was not valid
// JavaScript) has been removed.
//
// NOTE(review): the key "image" does not match any element name used in the
// other tables, which list 'img' -- confirm against the code that consumes
// this table before renaming it.
html4.REQUIREDATTRIBUTES = {
  "audio" : ["src"],
  "form" : ["action"],
  "iframe" : ["src"],
  "image" : ["src"],
  "video" : ["src"]
};
html4[ 'REQUIREDATTRIBUTES' ] = html4.REQUIREDATTRIBUTES;
// export for Closure Compiler
// When a global `window` exists, html4 is published on it under a quoted key.
// Where `window` is undefined the guard makes this a no-op.
if (typeof window !== 'undefined') {
window['html4'] = html4;
}
;
// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview
* An HTML sanitizer that can satisfy a variety of security policies.
*
* <p>
* The HTML sanitizer is built around a SAX parser and HTML element and
* attributes schemas.
*
* If the cssparser is loaded, inline styles are sanitized using the
* css property and value schemas. Otherwise they are removed during
* sanitization.
*
* If it exists, uses parseCssDeclarations, sanitizeCssProperty, cssSchema
*
* @author mikesamuel@gmail.com
* @author jasvir@gmail.com
* \@requires html4, URI
* \@overrides window
* \@provides html, html_sanitize
*/
// The Turkish i seems to be a non-issue, but abort in case it is.
// Load-time sanity check: if lowercasing 'I' does not give 'i' in this
// environment, abort by throwing (note that a plain string is thrown, not an
// Error object).
if ('I'.toLowerCase() !== 'i') { throw 'I/i problem'; }
/**
* \@namespace
*/
var html = (function(html4) {
// For closure compiler
// Optional CSS sanitizer hooks. They are read from `window` by quoted name,
// and each stays undefined when there is no `window` or when the
// corresponding global has not been defined.
var parseCssDeclarations, sanitizeCssProperty, cssSchema;
if ('undefined' !== typeof window) {
parseCssDeclarations = window['parseCssDeclarations'];
sanitizeCssProperty = window['sanitizeCssProperty'];
cssSchema = window['cssSchema'];
}
// The keys of this object must be 'quoted' or JSCompiler will mangle them!
// This is a partial list -- lookupEntity() uses the host browser's parser
// (when available) to implement full entity lookup.
// Note that entities are in general case-sensitive; the uppercase ones are
// explicitly defined by HTML5 (presumably as compatibility).
var ENTITIES = {
  'lt': '<',
  'LT': '<',
  'gt': '>',
  'GT': '>',
  'amp': '&',
  'AMP': '&',
  'quot': '"',
  'apos': '\'',
  // U+00A0 NO-BREAK SPACE. Written as a hex escape because the legacy octal
  // escape form is a SyntaxError in strict-mode and module code.
  'nbsp': '\xA0'
};
// Patterns for types of entity/character reference names.
var decimalEscapeRe = /^#(\d+)$/;
// Accepts both "#x" and "#X", matching what ENTITY_RE_1 / ENTITY_RE_2 let
// through.
var hexEscapeRe = /^#[xX]([0-9A-Fa-f]+)$/;
// contains every entity per http://www.w3.org/TR/2011/WD-html5-20110113/named-character-references.html
// (The second character class previously contained the range "A-z", which
// also admitted the punctuation between 'Z' and 'a' in ASCII.)
var safeEntityNameRe = /^[A-Za-z][A-Za-z0-9]+$/;
// Used as a hook to invoke the browser's entity parsing. <textarea> is used
// because its content is parsed for entities but not tags.
// TODO(kpreid): This retrieval is a kludge and leads to silent loss of
// functionality if the document isn't available.
var entityLookupElement =
    ('undefined' !== typeof window && window['document'])
    ? window['document'].createElement('textarea') : null;
/**
 * Converts the numeric value of a character reference to a string.
 * Code points above 0xFFFF become a surrogate pair instead of being
 * truncated to 16 bits, and values beyond U+10FFFF (or non-finite ones)
 * become U+FFFD REPLACEMENT CHARACTER.
 *
 * @param {number} codePoint the parsed numeric value.
 * @return {string}
 */
function numericEntityToString_(codePoint) {
  if (!(codePoint <= 0x10FFFF)) {
    return '\uFFFD';
  }
  if (codePoint <= 0xFFFF) {
    return String.fromCharCode(codePoint);
  }
  var offset = codePoint - 0x10000;
  return String.fromCharCode(
      0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
}
/**
 * Decodes an HTML entity.
 *
 * {\@updoc
 * $ lookupEntity('lt')
 * # '<'
 * $ lookupEntity('GT')
 * # '>'
 * $ lookupEntity('amp')
 * # '&'
 * $ lookupEntity('nbsp')
 * # '\xA0'
 * $ lookupEntity('apos')
 * # "'"
 * $ lookupEntity('quot')
 * # '"'
 * $ lookupEntity('#xa')
 * # '\n'
 * $ lookupEntity('#10')
 * # '\n'
 * $ lookupEntity('#x0a')
 * # '\n'
 * $ lookupEntity('#010')
 * # '\n'
 * $ lookupEntity('#x00A')
 * # '\n'
 * $ lookupEntity('#X41')
 * # 'A'
 * $ lookupEntity('#x1F600')
 * # '\uD83D\uDE00'
 * $ lookupEntity('Pi') // Known failure
 * # '\u03A0'
 * $ lookupEntity('pi') // Known failure
 * # '\u03C0'
 * }
 *
 * @param {string} name the content between the '&' and the ';'.
 * @return {string} a single unicode code-point as a string, or the original
 *     '&name;' text when the name cannot be decoded.
 */
function lookupEntity(name) {
  // TODO: entity lookup as specified by HTML5 actually depends on the
  // presence of the ";".
  // hasOwnProperty is taken from Object.prototype rather than from ENTITIES:
  // the cache write below can store an entry named "hasOwnProperty" on
  // ENTITIES, which would shadow the method and break every later lookup.
  if (Object.prototype.hasOwnProperty.call(ENTITIES, name)) {
    return ENTITIES[name];
  }
  var m = name.match(decimalEscapeRe);
  if (m) {
    return numericEntityToString_(parseInt(m[1], 10));
  }
  m = name.match(hexEscapeRe);
  if (m) {
    return numericEntityToString_(parseInt(m[1], 16));
  }
  if (entityLookupElement && safeEntityNameRe.test(name)) {
    // Let the host parser decode the named entity, then cache the result.
    entityLookupElement.innerHTML = '&' + name + ';';
    var text = entityLookupElement.textContent;
    ENTITIES[name] = text;
    return text;
  }
  return '&' + name + ';';
}
function decodeOneEntity(_, name) {
return lookupEntity(name);
}
var nulRe = /\0/g;
function stripNULs(s) {
return s.replace(nulRe, '');
}
var ENTITY_RE_1 = /&(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/g;
var ENTITY_RE_2 = /^(#[0-9]+|#[xX][0-9A-Fa-f]+|\w+);/;
/**
* The plain text of a chunk of HTML CDATA which possibly containing.
*
* {\@updoc
* $ unescapeEntities('')
* # ''
* $ unescapeEntities('hello World!')
* # 'hello World!'
* $ unescapeEntities('1 &lt; 2 &amp;&AMP; 4 &gt; 3&#10;')
* # '1 < 2 && 4 > 3\n'
* $ unescapeEntities('&lt;&lt <- unfinished entity&gt;')
* # '<&lt <- unfinished entity>'
* $ unescapeEntities('/foo?bar=baz&copy=true') // & often unescaped in URLS
* # '/foo?bar=baz&copy=true'
* $ unescapeEntities('pi=&pi;&#x3c0;, Pi=&Pi;\u03A0') // FIXME: known failure
* # 'pi=\u03C0\u03c0, Pi=\u03A0\u03A0'
* }
*
* @param {string} s a chunk of HTML CDATA. It must not start or end inside
* an HTML entity.
*/
function unescapeEntities(s) {
return s.replace(ENTITY_RE_1, decodeOneEntity);
}
var ampRe = /&/g;
var looseAmpRe = /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;
var ltRe = /[<]/g;
var gtRe = />/g;
var quotRe = /\"/g;

/**
 * Escapes HTML special characters in attribute values.
 *
 * The argument is coerced to a string; '&', '<', '>' and '"' are replaced
 * (every occurrence, '&' first so the generated entities are not escaped a
 * second time). No other character -- in particular neither '=' nor the
 * single quote -- is touched.
 *
 * {\@updoc
 * $ escapeAttrib('')
 * # ''
 * $ escapeAttrib('"<<&==&>>"') // Do not just escape the first occurrence.
 * # '&#34;&lt;&lt;&amp;==&amp;&gt;&gt;&#34;'
 * $ escapeAttrib('Hello <World>!')
 * # 'Hello &lt;World&gt;!'
 * }
 */
function escapeAttrib(s) {
  var escaped = '' + s;
  escaped = escaped.replace(ampRe, '&amp;');
  escaped = escaped.replace(ltRe, '&lt;');
  escaped = escaped.replace(gtRe, '&gt;');
  escaped = escaped.replace(quotRe, '&#34;');
  return escaped;
}

/**
 * Escape entities in RCDATA that can be escaped without changing the meaning.
 *
 * An '&' that cannot start an entity reference becomes '&amp;'; every '<'
 * and '>' becomes '&lt;' / '&gt;'. An '&' that may start an entity is kept.
 *
 * {\@updoc
 * $ normalizeRCData('1 < 2 &&amp; 3 > 4 &amp;& 5 &lt; 7&8')
 * # '1 &lt; 2 &amp;&amp; 3 &gt; 4 &amp;&amp; 5 &lt; 7&amp;8'
 * }
 */
function normalizeRCData(rcdata) {
  var withAmps = rcdata.replace(looseAmpRe, '&amp;$1');
  var withLts = withAmps.replace(ltRe, '&lt;');
  return withLts.replace(gtRe, '&gt;');
}
// TODO(felix8a): validate sanitizer regexs against the HTML5 grammar at
// http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
// We initially split input so that potentially meaningful characters
// like '<' and '>' are separate tokens, using a fast dumb process that
// ignores quoting. Then we walk that token stream, and when we see a
// '<' that's the start of a tag, we use ATTR_RE to extract tag
// attributes from the next token. That token will never have a '>'
// character. However, it might have an unbalanced quote character, and
// when we see that, we combine additional tokens to balance the quote.
// ATTR_RE matches a single attribute at the start of the remaining tag text:
// optional whitespace, an attribute name, and optionally '=' followed by a
// value. A quoted value may be unterminated (the closing-quote groups 5 and 7
// also match end of input); parseTagAndAttrs checks groups 4/5 and 6/7 to
// detect that case.
var ATTR_RE = new RegExp(
'^\\s*' +
'([-.:\\w]+)' + // 1 = Attribute name
'(?:' + (
'\\s*(=)\\s*' + // 2 = Is there a value?
'(' + ( // 3 = Attribute value
// TODO(felix8a): maybe use backref to match quotes
'(\")[^\"]*(\"|$)' + // 4, 5 = Double-quoted string
'|' +
'(\')[^\']*(\'|$)' + // 6, 7 = Single-quoted string
'|' +
// Positive lookahead to prevent interpretation of
// <foo a= b=c> as <foo a='b=c'>
// TODO(felix8a): might be able to drop this case
'(?=[a-z][-\\w]*\\s*=)' +
'|' +
// Unquoted value that isn't an attribute name
// (since we didn't match the positive lookahead above)
'[^\"\'\\s]*' ) +
')' ) +
')?',
'i');
// Feature test: does String.prototype.split include the text matched by
// capture groups in its result? htmlSplit falls back to a manual exec loop
// when it does not.
// false on IE<=8, true on most other browsers
var splitWillCapture = ('a,b'.split(/(,)/).length === 3);
// bitmask for tags with special parsing, like <script> and <textarea>
// (an element whose flags intersect this mask has its content handed to
// parseText instead of being tokenized as markup)
var EFLAGS_TEXT = html4.eflags['CDATA'] | html4.eflags['RCDATA'];
/**
 * Given a SAX-like event handler, produce a function that feeds those
 * events and a parameter to the event handler.
 *
 * The event handler has the form:{@code
 * {
 *   // Name is a lower-case HTML tag name. Attribs is an array of
 *   // alternating lower-case attribute names and decoded attribute
 *   // values. Param is the value passed to the returned function.
 *   startTag: function (name, attribs, param) { ... },
 *   endTag:   function (name, param) { ... },
 *   pcdata:   function (text, param) { ... },
 *   rcdata:   function (text, param) { ... },
 *   cdata:    function (text, param) { ... },
 *   comment:  function (text, param) { ... },
 *   startDoc: function (param) { ... },
 *   endDoc:   function (param) { ... }
 * }}
 * Every member is optional. All callbacks except startDoc and endDoc are
 * additionally passed two trailing arguments by the parser: the
 * continuation marker object and a function that resumes parsing after the
 * current event.
 *
 * @param {Object} handler a record containing event handlers.
 * @return {function(string, Object)} A function that takes a chunk of HTML
 *     and a parameter. The parameter is passed on to the handler methods.
 */
function makeSaxParser(handler) {
  // Snapshot the callbacks once. Each one is looked up under both its
  // unquoted and its quoted name so that handlers work whether or not
  // Closure Compiler renamed their properties.
  var hcopy = {};
  hcopy.cdata = handler.cdata || handler['cdata'];
  hcopy.comment = handler.comment || handler['comment'];
  hcopy.endDoc = handler.endDoc || handler['endDoc'];
  hcopy.endTag = handler.endTag || handler['endTag'];
  hcopy.pcdata = handler.pcdata || handler['pcdata'];
  hcopy.rcdata = handler.rcdata || handler['rcdata'];
  hcopy.startDoc = handler.startDoc || handler['startDoc'];
  hcopy.startTag = handler.startTag || handler['startTag'];

  var runParser = function (htmlText, param) {
    return parse(htmlText, hcopy, param);
  };
  return runParser;
}
// Parsing strategy is to split input into parts that might be lexically
// meaningful (every ">" becomes a separate part), and then recombine
// parts if we discover they're in a different context.
// TODO(felix8a): Significant performance regressions from -legacy,
// tested on
// Chrome 18.0
// Firefox 11.0
// IE 6, 7, 8, 9
// Opera 11.61
// Safari 5.1.3
// Many of these are unusual patterns that are linearly slower and still
// pretty fast (eg 1ms to 5ms), so not necessarily worth fixing.
// TODO(felix8a): "<script> && && && ... <\/script>" is slower on all
// browsers. The hotspot is htmlSplit.
// TODO(felix8a): "<p title='>>>>...'><\/p>" is slower on all browsers.
// This is partly htmlSplit, but the hotspot is parseTagAndAttrs.
// TODO(felix8a): "<a><\/a><a><\/a>..." is slower on IE9.
// "<a>1<\/a><a>1<\/a>..." is faster, "<a><\/a>2<a><\/a>2..." is faster.
// TODO(felix8a): "<p<p<p..." is slower on IE[6-8]
// Sentinel handed to every handler callback. parseCPS swallows an exception
// that is identical to this object and rethrows anything else, so a handler
// can throw it to stop the current parsing run without raising an error.
var continuationMarker = {};

/**
 * Tokenizes htmlText and runs the event loop over it from the beginning.
 *
 * @param {string} htmlText the markup to parse (coerced to a string by
 *     htmlSplit).
 * @param {Object} handler record of SAX callbacks, as built by makeSaxParser.
 * @param {*} param opaque value passed through to every callback.
 */
function parse(htmlText, handler, param) {
  var parts = htmlSplit(htmlText);
  // Memo flags shared by the initial run and every continuation: once a
  // forward scan for '>' or '-->' has failed, it is not repeated.
  var state = {
    noMoreGT: false,
    noMoreEndComments: false
  };
  parseCPS(handler, parts, 0, state, param);
}
// Builds a zero-argument function that, when called, runs parseCPS over the
// same token list starting at index `initial`, with the same handler, shared
// state and pass-through parameter.
function continuationMaker(h, parts, initial, state, param) {
  var resume = function () {
    parseCPS(h, parts, initial, state, param);
  };
  return resume;
}
/**
 * The parser's event loop: walks the token list produced by htmlSplit from
 * index `initial` and reports what it finds to the handler.
 *
 * Every callback except startDoc/endDoc is called with two extra trailing
 * arguments: continuationMarker and a continuation (see continuationMaker)
 * that restarts this loop at the first token not yet consumed by the event
 * being reported. If anything inside the loop throws continuationMarker, the
 * exception is swallowed and this invocation ends without calling endDoc;
 * every other exception propagates.
 *
 * @param {Object} h handler record (see makeSaxParser).
 * @param {Array.<string>} parts token list from htmlSplit.
 * @param {number} initial index of the first token to process. startDoc is
 *     only reported when this is 0.
 * @param {{noMoreGT: boolean, noMoreEndComments: boolean}} state memo flags
 *     shared between the initial run and all continuations.
 * @param {*} param opaque value passed through to the callbacks.
 */
function parseCPS(h, parts, initial, state, param) {
  // Reports `text` as pcdata (when the handler listens for it) together with
  // a continuation that resumes at parts[resumeAt].
  function emitPcdata(text, resumeAt) {
    if (h.pcdata) {
      h.pcdata(text, param, continuationMarker,
          continuationMaker(h, parts, resumeAt, state, param));
    }
  }

  try {
    if (h.startDoc && initial === 0) { h.startDoc(param); }
    var m, p, tagName;
    for (var pos = initial, end = parts.length; pos < end;) {
      var current = parts[pos++];
      var next = parts[pos];
      switch (current) {
        case '&':
          if (ENTITY_RE_2.test(next)) {
            // `next` is reported together with the '&' and skipped below, so
            // the continuation has to resume after it; resuming at `pos`
            // would report the same text a second time.
            emitPcdata('&' + next, pos + 1);
            pos++;
          } else {
            emitPcdata('&amp;', pos);
          }
          break;
        case '<\/':
          if ((m = /^([-\w:]+)[^\'\"]*/.exec(next))) {
            if (m[0].length === next.length && parts[pos + 1] === '>') {
              // fast case, no attribute parsing needed
              pos += 2;
              tagName = m[1].toLowerCase();
              if (h.endTag) {
                h.endTag(tagName, param, continuationMarker,
                    continuationMaker(h, parts, pos, state, param));
              }
            } else {
              // slow case, need to parse attributes
              // TODO(felix8a): do we really care about misparsing this?
              pos = parseEndTag(
                  parts, pos, h, param, continuationMarker, state);
            }
          } else {
            emitPcdata('&lt;/', pos);
          }
          break;
        case '<':
          if ((m = /^([-\w:]+)\s*\/?/.exec(next))) {
            if (m[0].length === next.length && parts[pos + 1] === '>') {
              // fast case, no attribute parsing needed
              pos += 2;
              tagName = m[1].toLowerCase();
              if (h.startTag) {
                h.startTag(tagName, [], param, continuationMarker,
                    continuationMaker(h, parts, pos, state, param));
              }
              // tags like <script> and <textarea> have special parsing
              var eflags = html4.ELEMENTS[tagName];
              if (eflags & EFLAGS_TEXT) {
                var tag = { name: tagName, next: pos, eflags: eflags };
                pos = parseText(
                    parts, tag, h, param, continuationMarker, state);
              }
            } else {
              // slow case, need to parse attributes
              pos = parseStartTag(
                  parts, pos, h, param, continuationMarker, state);
            }
          } else {
            emitPcdata('&lt;', pos);
          }
          break;
        case '<\!--':
          // The pathological case is n copies of '<\!--' without '-->', and
          // repeated failure to find '-->' is quadratic. We avoid that by
          // remembering when search for '-->' fails.
          if (!state.noMoreEndComments) {
            // A comment <\!--x--> is split into three tokens:
            //   '<\!--', 'x--', '>'
            // We want to find the next '>' token that has a preceding '--'.
            // pos is at the 'x--'.
            for (p = pos + 1; p < end; p++) {
              if (parts[p] === '>' && /--$/.test(parts[p - 1])) { break; }
            }
            if (p < end) {
              if (h.comment) {
                var comment = parts.slice(pos, p).join('');
                h.comment(
                    comment.substr(0, comment.length - 2), param,
                    continuationMarker,
                    continuationMaker(h, parts, p + 1, state, param));
              }
              pos = p + 1;
            } else {
              state.noMoreEndComments = true;
            }
          }
          if (state.noMoreEndComments) {
            emitPcdata('&lt;!--', pos);
          }
          break;
        case '<\!':
          if (!/^\w/.test(next)) {
            emitPcdata('&lt;!', pos);
          } else {
            // similar to noMoreEndComment logic
            if (!state.noMoreGT) {
              for (p = pos + 1; p < end; p++) {
                if (parts[p] === '>') { break; }
              }
              if (p < end) {
                // Everything up to and including the '>' is dropped.
                pos = p + 1;
              } else {
                state.noMoreGT = true;
              }
            }
            if (state.noMoreGT) {
              emitPcdata('&lt;!', pos);
            }
          }
          break;
        case '<?':
          // similar to noMoreEndComment logic
          if (!state.noMoreGT) {
            for (p = pos + 1; p < end; p++) {
              if (parts[p] === '>') { break; }
            }
            if (p < end) {
              // Everything up to and including the '>' is dropped.
              pos = p + 1;
            } else {
              state.noMoreGT = true;
            }
          }
          if (state.noMoreGT) {
            emitPcdata('&lt;?', pos);
          }
          break;
        case '>':
          emitPcdata('&gt;', pos);
          break;
        case '':
          break;
        default:
          emitPcdata(current, pos);
          break;
      }
    }
    if (h.endDoc) { h.endDoc(param); }
  } catch (e) {
    if (e !== continuationMarker) { throw e; }
  }
}
// Split str into parts for the html parser: the delimiters '</', '<!--',
// '<!', '<?', '&', '<' and '>' each become a token of their own, interleaved
// with the (possibly empty) text between them.
function htmlSplit(str) {
  // Built per call rather than hoisted: the exec loop below depends on this
  // regex's own lastIndex.
  var delimiters = /(<\/|<\!--|<[!?]|[&<>])/g;
  var text = str + '';
  if (splitWillCapture) {
    return text.split(delimiters);
  }
  // Manual equivalent for engines whose split() drops captured delimiters.
  var pieces = [];
  var cursor = 0;
  for (var hit = delimiters.exec(text); hit !== null;
       hit = delimiters.exec(text)) {
    pieces.push(text.substring(cursor, hit.index), hit[0]);
    cursor = hit.index + hit[0].length;
  }
  pieces.push(text.substring(cursor));
  return pieces;
}
/**
 * Slow path for an end tag whose text is more than a bare name followed by
 * '>' (for example one that carries attributes or quotes).
 *
 * @param {Array.<string>} parts token list from htmlSplit.
 * @param {number} pos index of the token that starts with the tag name.
 * @param {Object} h handler record.
 * @param {*} param opaque value passed through to the callbacks.
 * @param {Object} continuationMarker sentinel forwarded to the callback.
 * @param {Object} state shared parser memo flags.
 * @return {number} index of the next token to process; parts.length when
 *     the tag is never closed, which drops the rest of the input.
 */
function parseEndTag(parts, pos, h, param, continuationMarker, state) {
  var tag = parseTagAndAttrs(parts, pos);
  // drop unclosed tags
  if (!tag) { return parts.length; }
  if (h.endTag) {
    // Resume after the whole tag (tag.next), as parseStartTag does. Resuming
    // at `pos` would make the continuation treat the tag's own name token
    // as text.
    h.endTag(tag.name, param, continuationMarker,
        continuationMaker(h, parts, tag.next, state, param));
  }
  return tag.next;
}
// Slow path for a start tag that needs attribute parsing. Reports the tag to
// h.startTag (with a continuation that resumes right after the tag) and
// returns the index of the next token to process. For elements flagged CDATA
// or RCDATA the element's text content is consumed through parseText first.
// An unclosed tag yields parts.length, i.e. the rest of the input is dropped.
function parseStartTag(parts, pos, h, param, continuationMarker, state) {
  var parsed = parseTagAndAttrs(parts, pos);
  if (!parsed) {
    // drop unclosed tags
    return parts.length;
  }
  if (h.startTag) {
    var resume = continuationMaker(h, parts, parsed.next, state, param);
    h.startTag(parsed.name, parsed.attrs, param, continuationMarker, resume);
  }
  // tags like <script> and <textarea> have special parsing
  var hasTextContent = parsed.eflags & EFLAGS_TEXT;
  return hasTextContent
      ? parseText(parts, parsed, h, param, continuationMarker, state)
      : parsed.next;
}
// Cache of end-tag matchers, keyed by lower-case tag name.
var endTagRe = {};
// Tags like <script> and <textarea> are flagged as CDATA or RCDATA,
// which means everything is text until we see the correct closing tag.
// Joins the tokens from tag.next up to (not including) the '</' token that
// opens the matching end tag -- or up to the end of input when there is
// none -- reports the result through h.cdata or h.rcdata (the latter after
// normalizeRCData), and returns the index where parsing should continue.
// Throws Error('bug') if the tag carries neither flag.
function parseText(parts, tag, h, param, continuationMarker, state) {
  var total = parts.length;
  var tagName = tag.name;
  if (!endTagRe.hasOwnProperty(tagName)) {
    endTagRe[tagName] = new RegExp('^' + tagName + '(?:[\\s\\/]|$)', 'i');
  }
  var closer = endTagRe[tagName];
  var start = tag.next;
  var stop = start + 1;
  // Advance until parts[stop] is the name token of the matching end tag.
  while (stop < total &&
         !(parts[stop - 1] === '<\/' && closer.test(parts[stop]))) {
    stop++;
  }
  // Found: step back onto the '</' token so the main loop sees the end tag.
  if (stop < total) { stop -= 1; }
  var text = parts.slice(start, stop).join('');
  var flags = tag.eflags;
  if (flags & html4.eflags['CDATA']) {
    if (h.cdata) {
      h.cdata(text, param, continuationMarker,
          continuationMaker(h, parts, stop, state, param));
    }
  } else if (flags & html4.eflags['RCDATA']) {
    if (h.rcdata) {
      h.rcdata(normalizeRCData(text), param, continuationMarker,
          continuationMaker(h, parts, stop, state, param));
    }
  } else {
    throw new Error('bug');
  }
  return stop;
}
// at this point, parts[pos-1] is either "<" or "<\/".
// Parses the tag whose name starts parts[pos] and returns a record with
//   name:   the lower-cased tag name,
//   eflags: html4.ELEMENTS[name] (undefined for unknown elements),
//   attrs:  array of alternating lower-cased attribute names and decoded
//           values ('' when the attribute has no '=' part),
//   next:   index of the token after the tag's closing '>'.
// Returns undefined when no '>' token follows, i.e. the tag is unclosed.
// The caller must have checked that parts[pos] begins with a tag name; the
// match result is dereferenced without a null check.
function parseTagAndAttrs(parts, pos) {
var m = /^([-\w:]+)/.exec(parts[pos]);
var tag = {};
tag.name = m[1].toLowerCase();
tag.eflags = html4.ELEMENTS[tag.name];
// buf holds the not-yet-parsed attribute text of the tag.
var buf = parts[pos].substr(m[0].length);
// Find the next '>'. We optimistically assume this '>' is not in a
// quoted context, and further down we fix things up if it turns out to
// be quoted.
var p = pos + 1;
var end = parts.length;
for (; p < end; p++) {
if (parts[p] === '>') { break; }
buf += parts[p];
}
if (end <= p) { return void 0; }
var attrs = [];
while (buf !== '') {
m = ATTR_RE.exec(buf);
if (!m) {
// No attribute found: skip garbage
// (always removes at least one character, so the loop makes progress)
buf = buf.replace(/^[\s\S][^a-z\s]*/, '');
} else if ((m[4] && !m[5]) || (m[6] && !m[7])) {
// Unterminated quote: slurp to the next unquoted '>'
// The '>' found above was inside the quoted value. Append it and the
// following tokens until one containing the closing quote has been seen,
// then stop at the first '>' after that.
var quote = m[4] || m[6];
var sawQuote = false;
var abuf = [buf, parts[p++]];
for (; p < end; p++) {
if (sawQuote) {
if (parts[p] === '>') { break; }
} else if (0 <= parts[p].indexOf(quote)) {
sawQuote = true;
}
abuf.push(parts[p]);
}
// Slurp failed: lose the garbage
// (the attributes collected so far are kept; p is now >= end)
if (end <= p) { break; }
// Otherwise retry attribute parsing
buf = abuf.join('');
continue;
} else {
// We have an attribute
var aName = m[1].toLowerCase();
var aValue = m[2] ? decodeValue(m[3]) : '';
attrs.push(aName, aValue);
buf = buf.substr(m[0].length);
}
}
tag.attrs = attrs;
tag.next = p + 1;
return tag;
}
// Turns the raw text of an attribute value (ATTR_RE group 3) into its plain
// string value: a leading quote character and the final character are
// stripped when the value starts with " or ', NUL characters are removed,
// and entity references are decoded.
function decodeValue(v) {
  var opener = v.charAt(0);
  var inner = v;
  if (opener === '"' || opener === '\'') {
    inner = v.slice(1, v.length - 1);
  }
  var withoutNuls = stripNULs(inner);
  return unescapeEntities(withoutNuls);
}
/**
* Returns a function that strips unsafe tags and attributes from html.
*
* The returned function is a SAX parser (see makeSaxParser) whose handlers
* append sanitized markup to the array passed as its second argument. While
* a document is being processed the handlers keep a stack of the currently
* open elements, so that end tags can be balanced and elements that are
* still open at the end of the input get closed.
*
* @param {function(string, Array.<string>): ?Object} tagPolicy
* A function that takes (tagName, attribs[]), where tagName is a key in
* html4.ELEMENTS and attribs is an array of alternating attribute names
* and values. It should return a record (as follows), or null to delete
* the element. It's okay for tagPolicy to modify the attribs array.
* Record keys:
* attribs: (required) Sanitized attributes array.
* tagName: Replacement tag name.
* @return {function(string, Array)} A function that sanitizes a string of
* HTML and appends result strings to the second argument, an array.
* @throws {Error} from the startTag handler if tagPolicy returns a truthy
* non-object, or an object without an 'attribs' key.
*/
function makeHtmlSanitizer(tagPolicy) {
// Open elements, innermost last; each entry is {orig, rep}: the tag name
// as it appeared in the input and the name that was written to the output.
var stack;
// True after the policy rejected a non-EMPTY element. While set, start tags
// and text are dropped; the next endTag event (whatever its name) clears it.
var ignoring;
var emit = function (text, out) {
if (!ignoring) { out.push(text); }
};
return makeSaxParser({
'startDoc': function(_) {
stack = [];
ignoring = false;
},
'startTag': function(tagNameOrig, attribs, out) {
if (ignoring) { return; }
// Elements missing from the schema are dropped; their content is not.
if (!html4.ELEMENTS.hasOwnProperty(tagNameOrig)) { return; }
var eflagsOrig = html4.ELEMENTS[tagNameOrig];
// FOLDABLE elements have their tags dropped but their content kept.
if (eflagsOrig & html4.eflags['FOLDABLE']) {
return;
}
var decision = tagPolicy(tagNameOrig, attribs);
if (!decision) {
// Rejected. An EMPTY element has no content to skip, so only start
// ignoring for the others.
ignoring = !(eflagsOrig & html4.eflags['EMPTY']);
return;
} else if (typeof decision !== 'object') {
throw new Error('tagPolicy did not return object (old API?)');
}
if ('attribs' in decision) {
attribs = decision['attribs'];
} else {
throw new Error('tagPolicy gave no attribs');
}
var eflagsRep;
var tagNameRep;
if ('tagName' in decision) {
tagNameRep = decision['tagName'];
eflagsRep = html4.ELEMENTS[tagNameRep];
} else {
tagNameRep = tagNameOrig;
eflagsRep = eflagsOrig;
}
// TODO(mikesamuel): relying on tagPolicy not to insert unsafe
// attribute names.
// If this is an optional-end-tag element and either this element or the
// preceding sibling of the same kind was rewritten, then insert a close
// tag to preserve structure.
if (eflagsOrig & html4.eflags['OPTIONAL_ENDTAG']) {
var onStack = stack[stack.length - 1];
if (onStack && onStack.orig === tagNameOrig &&
(onStack.rep !== tagNameRep || tagNameOrig !== tagNameRep)) {
out.push('<\/', onStack.rep, '>');
}
}
if (!(eflagsOrig & html4.eflags['EMPTY'])) {
stack.push({orig: tagNameOrig, rep: tagNameRep});
}
out.push('<', tagNameRep);
// Attributes whose value is null or undefined are omitted; all other
// values are written double-quoted and escaped.
for (var i = 0, n = attribs.length; i < n; i += 2) {
var attribName = attribs[i],
value = attribs[i + 1];
if (value !== null && value !== void 0) {
out.push(' ', attribName, '="', escapeAttrib(value), '"');
}
}
out.push('>');
if ((eflagsOrig & html4.eflags['EMPTY'])
&& !(eflagsRep & html4.eflags['EMPTY'])) {
// replacement is non-empty, synthesize end tag
out.push('<\/', tagNameRep, '>');
}
},
'endTag': function(tagName, out) {
if (ignoring) {
ignoring = false;
return;
}
if (!html4.ELEMENTS.hasOwnProperty(tagName)) { return; }
var eflags = html4.ELEMENTS[tagName];
// End tags of EMPTY and FOLDABLE elements produce no output.
if (!(eflags & (html4.eflags['EMPTY'] | html4.eflags['FOLDABLE']))) {
// Find the stack entry this end tag closes.
var index;
if (eflags & html4.eflags['OPTIONAL_ENDTAG']) {
for (index = stack.length; --index >= 0;) {
var stackElOrigTag = stack[index].orig;
if (stackElOrigTag === tagName) { break; }
if (!(html4.ELEMENTS[stackElOrigTag] &
html4.eflags['OPTIONAL_ENDTAG'])) {
// Don't pop non optional end tags looking for a match.
return;
}
}
} else {
for (index = stack.length; --index >= 0;) {
if (stack[index].orig === tagName) { break; }
}
}
if (index < 0) { return; } // Not opened.
// Close everything opened inside the matched element, innermost first,
// skipping elements whose end tag is optional.
for (var i = stack.length; --i > index;) {
var stackElRepTag = stack[i].rep;
if (!(html4.ELEMENTS[stackElRepTag] &
html4.eflags['OPTIONAL_ENDTAG'])) {
out.push('<\/', stackElRepTag, '>');
}
}
// Emit the end tag under the name the start tag was written with.
if (index < stack.length) {
tagName = stack[index].rep;
}
stack.length = index;
out.push('<\/', tagName, '>');
}
},
'pcdata': emit,
'rcdata': emit,
'cdata': emit,
'endDoc': function(out) {
// Close whatever is still open, innermost first.
for (; stack.length; stack.length--) {
out.push('<\/', stack[stack.length - 1].rep, '>');
}
}
});
}
// Schemes a URI may carry to be offered to the rewriter; scheme-less
// (relative) URIs are offered as well.
var ALLOWED_URI_SCHEMES = /^(?:https?|mailto)$/i;

// Vets a URI value through the caller-supplied rewriter.
// Returns null when no rewriter is given, when the value cannot be parsed,
// when it has a scheme outside ALLOWED_URI_SCHEMES, when the rewriter returns
// a falsy value, or when anything along the way throws. Otherwise returns
// the rewriter's result converted with toString().
// The rewriter is called as naiveUriRewriter(parsedUri, effect, ltype, hints).
function safeUri(uri, effect, ltype, hints, naiveUriRewriter) {
  if (!naiveUriRewriter) { return null; }
  try {
    var parsed = URI.parse('' + uri);
    if (!parsed) { return null; }
    var schemeAllowed = !parsed.hasScheme() ||
        ALLOWED_URI_SCHEMES.test(parsed.getScheme());
    if (!schemeAllowed) { return null; }
    var rewritten = naiveUriRewriter(parsed, effect, ltype, hints);
    if (!rewritten) { return null; }
    return rewritten.toString();
  } catch (e) {
    return null;
  }
}
/**
 * Reports a sanitizer decision to a caller-supplied logger.
 *
 * Called without an attribute name it reports that the whole tag was
 * removed. Otherwise it reports the attribute as "added", "removed" or
 * "changed" when the old and new value differ, and reports nothing when
 * they are identical.
 *
 * @param {function(string, Object)} logger receives a message and a record
 *     describing the change.
 * @param {string} tagName the tag concerned.
 * @param {?string=} attribName the attribute concerned; falsy for a
 *     tag-level report.
 * @param {*=} oldValue attribute value before sanitization.
 * @param {*=} newValue attribute value after sanitization.
 */
function log(logger, tagName, attribName, oldValue, newValue) {
  if (!attribName) {
    logger(tagName + " removed", {
      change: "removed",
      tagName: tagName
    });
    // A tag-level report has no attribute to compare; without this return a
    // caller passing differing values would also get a bogus
    // "<tag>.undefined changed" entry.
    return;
  }
  if (oldValue !== newValue) {
    var changed = "changed";
    if (oldValue && !newValue) {
      changed = "removed";
    } else if (!oldValue && newValue) {
      changed = "added";
    }
    logger(tagName + "." + attribName + " " + changed, {
      change: changed,
      tagName: tagName,
      attribName: attribName,
      oldValue: oldValue,
      newValue: newValue
    });
  }
}
// Looks up an attribute in a schema map whose keys have the form
// "<tag>::<attribute>". The tag-specific entry wins; the wildcard entry
// "*::<attribute>" is the fallback. Returns undefined when neither exists.
function lookupAttribute(map, tagName, attribName) {
  var candidates = [tagName + '::' + attribName, '*::' + attribName];
  for (var k = 0; k < candidates.length; k++) {
    var key = candidates[k];
    if (map.hasOwnProperty(key)) {
      return map[key];
    }
  }
  return void 0;
}
// Schema accessors: each one resolves a (tag, attribute) pair against one of
// the html4 definition tables via lookupAttribute and returns undefined when
// the table has no entry for the pair.

// Attribute type, from html4.ATTRIBS.
function getAttributeType(tagName, attribName) {
  var table = html4.ATTRIBS;
  return lookupAttribute(table, tagName, attribName);
}

// Loader type of a URI attribute, from html4.LOADERTYPES.
function getLoaderType(tagName, attribName) {
  var table = html4.LOADERTYPES;
  return lookupAttribute(table, tagName, attribName);
}

// URI effect of a URI attribute, from html4.URIEFFECTS.
function getUriEffect(tagName, attribName) {
  var table = html4.URIEFFECTS;
  return lookupAttribute(table, tagName, attribName);
}
/**
 * Sanitizes attributes on an HTML tag.
 *
 * The attribs array is modified in place: names stay where they are and the
 * value of every attribute that is not allowed is replaced by null.
 *
 * @param {string} tagName An HTML tag name in lowercase.
 * @param {Array.<?string>} attribs An array of alternating names and values.
 * @param {?function(Object, *, *, Object): ?Object} opt_naiveUriRewriter A
 *     transform to apply to URI attributes and to URLs inside style
 *     attributes. It is invoked through safeUri with the parsed URI, the
 *     URI effect, the loader type and a hints record, and can return a
 *     replacement or null to delete the attribute. If unspecified, URI
 *     attributes are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes; it can return a new string value, or null to delete
 *     the attribute. If unspecified, these attributes are kept unchanged.
 * @param {function(string, Object)=} opt_logger If given, it is told (via
 *     log) about every attribute whose value was changed or removed.
 * @return {Array.<?string>} The sanitized attributes as a list of alternating
 *     names and values, where a null value means to omit the attribute.
 */
function sanitizeAttribs(tagName, attribs, opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  // TODO(felix8a): it's obnoxious that domado duplicates much of this
  // TODO(felix8a): maybe consistently enforce constraints like target=
  for (var i = 0; i < attribs.length; i += 2) {
    var attribName = attribs[i];
    var value = attribs[i + 1];
    var oldValue = value;
    var atype = getAttributeType(tagName, attribName);
    if (atype === void 0) {
      // Not in the schema at all. Handled before the switch so that an
      // undefined type can never match a case label.
      value = null;
    } else {
      switch (atype) {
        case html4.atype['NONE']: break;
        case html4.atype['SCRIPT']:
          value = null;
          break;
        case html4.atype['STYLE']:
          if ('undefined' === typeof parseCssDeclarations) {
            // No CSS parser loaded: inline styles cannot be vetted.
            value = null;
            break;
          }
          var sanitizedDeclarations = [];
          parseCssDeclarations(
              value,
              {
                'declaration': function (property, tokens) {
                  var normProp = property.toLowerCase();
                  sanitizeCssProperty(
                      normProp, tokens,
                      opt_naiveUriRewriter
                          ? function (url) {
                              return safeUri(
                                  url, html4.ueffects.SAME_DOCUMENT,
                                  html4.ltypes.SANDBOXED,
                                  {
                                    "TYPE": "CSS",
                                    "CSS_PROP": normProp
                                  }, opt_naiveUriRewriter);
                            }
                          : null);
                  // Only declarations that still have tokens after
                  // sanitizeCssProperty are kept.
                  if (tokens.length) {
                    sanitizedDeclarations.push(
                        normProp + ': ' + tokens.join(' '));
                  }
                }
              });
          value = sanitizedDeclarations.length > 0 ?
              sanitizedDeclarations.join(' ; ') : null;
          break;
        case html4.atype['ID']:
        case html4.atype['IDREF']:
        case html4.atype['IDREFS']:
        case html4.atype['GLOBAL_NAME']:
        case html4.atype['LOCAL_NAME']:
        case html4.atype['CLASSES']:
          value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
          break;
        case html4.atype['URI']:
          value = safeUri(value,
              getUriEffect(tagName, attribName),
              getLoaderType(tagName, attribName),
              {
                "TYPE": "MARKUP",
                "XML_ATTR": attribName,
                "XML_TAG": tagName
              }, opt_naiveUriRewriter);
          break;
        case html4.atype['URI_FRAGMENT']:
          if (value && '#' === value.charAt(0)) {
            value = value.substring(1);  // remove the leading '#'
            value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
            if (value !== null && value !== void 0) {
              value = '#' + value;  // restore the leading '#'
            }
          } else {
            value = null;
          }
          break;
        default:
          // Known attribute, but of a type this function does not handle.
          value = null;
          break;
      }
    }
    // Attributes of type NONE are passed through untouched and never
    // reported; everything else is handed to log, which only reports
    // actual changes.
    if (opt_logger && atype !== html4.atype['NONE']) {
      log(opt_logger, tagName, attribName, oldValue, value);
    }
    attribs[i + 1] = value;
  }
  return attribs;
}
/**
 * Creates a tag policy that omits all tags marked UNSAFE in html4-defs.js
 * and applies the default attribute sanitizer with the supplied policy for
 * URI attributes and NMTOKEN attributes.
 *
 * A tag that html4.REQUIREDATTRIBUTES lists is also omitted when, after
 * attribute sanitization, one of its required attributes is missing or
 * empty (see missRequiredAttributes).
 *
 * @param {?function(Object, *, *, Object): ?Object} opt_naiveUriRewriter A
 *     transform to apply to URI attributes. If not given, URI attributes
 *     are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes. If not given, such attributes are left unchanged.
 * @param {function(string, Object)=} opt_logger If given, it is passed on to
 *     sanitizeAttribs and is also told (via log) when an UNSAFE tag is
 *     removed.
 * @return {function(string, Array.<?string>)} A tagPolicy suitable for
 *     passing to html.sanitize. It returns { 'attribs': ... } to keep a tag
 *     and undefined to drop it.
 */
function makeTagPolicy(opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  return function(tagName, attribs) {
    if (!(html4.ELEMENTS[tagName] & html4.eflags['UNSAFE'])) {
      var sanitizedAttribs = sanitizeAttribs(
          tagName, attribs, opt_naiveUriRewriter, opt_nmTokenPolicy,
          opt_logger);
      var requiredAttributes = html4.REQUIREDATTRIBUTES[tagName];
      if (requiredAttributes &&
          missRequiredAttributes(sanitizedAttribs, requiredAttributes)) {
        // Drop the tag: a required attribute did not survive sanitization.
        return;
      }
      return { 'attribs': sanitizedAttribs };
    } else {
      if (opt_logger) {
        log(opt_logger, tagName, undefined, undefined, undefined);
      }
    }
  };
}
/**
 * Tells whether a tag lacks one of its required attributes.
 *
 * Each required attribute is checked on its own, so repeating one attribute
 * can neither stand in for a different required attribute nor make a
 * complete tag look incomplete.
 *
 * @param {Array.<?string>} sanitizedAttributes alternating attribute names
 *     and values, as returned by sanitizeAttribs.
 * @param {Array.<string>} requiredAttributes names that must be present.
 * @return {boolean} true if at least one required name has no occurrence
 *     with a non-empty value.
 */
function missRequiredAttributes(sanitizedAttributes, requiredAttributes) {
  for (var r = 0; r < requiredAttributes.length; r++) {
    var wanted = requiredAttributes[r];
    var present = false;
    for (var i = 0, length = sanitizedAttributes.length; i < length; i += 2) {
      var value = sanitizedAttributes[i + 1];
      if (sanitizedAttributes[i] === wanted && value && value.length > 0) {
        present = true;
        break;
      }
    }
    if (!present) { return true; }
  }
  return false;
}
/**
 * Sanitizes HTML tags and attributes according to a given policy.
 *
 * Builds a sanitizer for the policy, lets it append its output fragments to
 * a fresh array and returns the fragments concatenated.
 *
 * @param {string} inputHtml The HTML to sanitize.
 * @param {function(string, Array.<?string>)} tagPolicy A function that
 *     decides which tags to accept and sanitizes their attributes (see
 *     makeHtmlSanitizer above for details).
 * @return {string} The sanitized HTML.
 */
function sanitizeWithPolicy(inputHtml, tagPolicy) {
  var fragments = [];
  var sanitizer = makeHtmlSanitizer(tagPolicy);
  sanitizer(inputHtml, fragments);
  return fragments.join('');
}
/**
 * Strips unsafe tags and attributes from HTML.
 * @param {string} inputHtml The HTML to sanitize.
 * @param {?function(Object, *, *, Object): ?Object} opt_naiveUriRewriter A
 *     transform to apply to URI attributes. If not given, URI attributes
 *     are deleted.
 * @param {function(?string): ?string} opt_nmTokenPolicy A transform to apply
 *     to attributes containing HTML names, element IDs, and space-separated
 *     lists of classes. If not given, such attributes are left unchanged.
 * @param {function(string, Object)=} opt_logger If given, it is passed to
 *     makeTagPolicy, which uses it to report removed tags and changed or
 *     removed attributes.
 * @return {string} The sanitized HTML.
 */
function sanitize(inputHtml, opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger) {
  var tagPolicy = makeTagPolicy(opt_naiveUriRewriter, opt_nmTokenPolicy, opt_logger);
  return sanitizeWithPolicy(inputHtml, tagPolicy);
}
// Export both quoted and unquoted names for Closure linkage.
// Each function is assigned under a dotted name and a quoted name. In
// uncompiled code both assignments hit the same property; the quoted form is
// there so the name survives property renaming.
// This object is local to the enclosing function and is what that function
// returns.
var html = {};
html.escapeAttrib = html['escapeAttrib'] = escapeAttrib;
html.makeHtmlSanitizer = html['makeHtmlSanitizer'] = makeHtmlSanitizer;
html.makeSaxParser = html['makeSaxParser'] = makeSaxParser;
html.makeTagPolicy = html['makeTagPolicy'] = makeTagPolicy;
html.normalizeRCData = html['normalizeRCData'] = normalizeRCData;
html.sanitize = html['sanitize'] = sanitize;
html.sanitizeAttribs = html['sanitizeAttribs'] = sanitizeAttribs;
html.sanitizeWithPolicy = html['sanitizeWithPolicy'] = sanitizeWithPolicy;
html.unescapeEntities = html['unescapeEntities'] = unescapeEntities;
return html;
})(html4);
// Top-level alias for html.sanitize.
var html_sanitize = html['sanitize'];
// Exports for Closure compiler. Note this file is also cajoled
// for domado and run in an environment without 'window'
// (hence the typeof guard). Quoted keys keep the exported names stable.
if (typeof window !== 'undefined') {
window['html'] = html;
window['html_sanitize'] = html_sanitize;
}