import {getDOM, DomAdapter} from '../dom/dom_adapter';
import {assertionsEnabled} from '../../src/facade/lang';

import {sanitizeUrl} from './url_sanitizer';

/** A <body> element that can be safely used to parse untrusted HTML. Lazily initialized below. */
let inertElement: HTMLElement = null;
/** Lazily initialized to make sure the DOM adapter gets set before use. */
let DOM: DomAdapter = null;

/**
 * Returns an HTML element that is guaranteed to not execute code when creating elements in it.
 *
 * Also initializes the module-level `DOM` adapter. Note that only the <body> fallback is cached in
 * `inertElement`; when <template> is supported a fresh template element is returned on each call.
 */
function getInertElement() {
  if (inertElement) return inertElement;
  DOM = getDOM();

  // Prefer using <template> element if supported.
  let templateEl = DOM.createElement('template');
  if ('content' in templateEl) return templateEl;

  let doc = DOM.createHtmlDocument();
  inertElement = DOM.querySelector(doc, 'body');
  if (inertElement == null) {
    // usually there should be only one body element in the document, but IE doesn't have any, so we
    // need to create one.
    let html = DOM.createElement('html', doc);
    inertElement = DOM.createElement('body', doc);
    DOM.appendChild(html, inertElement);
    DOM.appendChild(doc, html);
  }
  return inertElement;
}

/** Builds a lookup set from a comma-separated list of names; keys are lower-cased. */
function tagSet(tags: string): {[k: string]: boolean} {
  let res: {[k: string]: boolean} = {};
  for (let t of tags.split(',')) res[t.toLowerCase()] = true;
  return res;
}

/** Returns a new set containing the own keys of every given set. */
function merge(...sets: {[k: string]: boolean}[]): {[k: string]: boolean} {
  let res: {[k: string]: boolean} = {};
  for (let s of sets) {
    for (let v in s) {
      if (s.hasOwnProperty(v)) res[v] = true;
    }
  }
  return res;
}

// Good source of info about elements and attributes
// http://dev.w3.org/html5/spec/Overview.html#semantics
// http://simon.html5.org/html-elements

// Safe Void Elements - HTML5
// http://dev.w3.org/html5/spec/Overview.html#void-elements
const VOID_ELEMENTS = tagSet('area,br,col,hr,img,wbr');

// Elements that you can, intentionally, leave open (and which close themselves)
// http://dev.w3.org/html5/spec/Overview.html#optional-tags
const OPTIONAL_END_TAG_BLOCK_ELEMENTS = tagSet('colgroup,dd,dt,li,p,tbody,td,tfoot,th,thead,tr');
const OPTIONAL_END_TAG_INLINE_ELEMENTS = tagSet('rp,rt');
const OPTIONAL_END_TAG_ELEMENTS =
    merge(OPTIONAL_END_TAG_INLINE_ELEMENTS, OPTIONAL_END_TAG_BLOCK_ELEMENTS);

// Safe Block Elements - HTML5
const BLOCK_ELEMENTS = merge(
    OPTIONAL_END_TAG_BLOCK_ELEMENTS,
    tagSet(
        'address,article,' +
        'aside,blockquote,caption,center,del,dir,div,dl,figure,figcaption,footer,h1,h2,h3,h4,h5,' +
        'h6,header,hgroup,hr,ins,map,menu,nav,ol,pre,section,table,ul'));

// Inline Elements - HTML5
const INLINE_ELEMENTS = merge(
    OPTIONAL_END_TAG_INLINE_ELEMENTS,
    tagSet(
        'a,abbr,acronym,b,' +
        'bdi,bdo,big,br,cite,code,del,dfn,em,font,i,img,ins,kbd,label,map,mark,q,ruby,rp,rt,s,' +
        'samp,small,span,strike,strong,sub,sup,time,tt,u,var'));

const VALID_ELEMENTS =
    merge(VOID_ELEMENTS, BLOCK_ELEMENTS, INLINE_ELEMENTS, OPTIONAL_END_TAG_ELEMENTS);

// Attributes that have href and hence need to be sanitized
const URI_ATTRS = tagSet('background,cite,href,longdesc,src,xlink:href');

const HTML_ATTRS = tagSet(
    'abbr,align,alt,axis,bgcolor,border,cellpadding,cellspacing,class,clear,' +
    'color,cols,colspan,compact,coords,dir,face,headers,height,hreflang,hspace,' +
    'ismap,lang,language,nohref,nowrap,rel,rev,rows,rowspan,rules,' +
    'scope,scrolling,shape,size,span,start,summary,tabindex,target,title,type,' +
    'valign,value,vspace,width');

const VALID_ATTRS = merge(URI_ATTRS, HTML_ATTRS);

/**
 * SanitizingHtmlSerializer serializes a DOM fragment, stripping out any unsafe elements and unsafe
 * attributes.
 *
 * Tags not listed in VALID_ELEMENTS are dropped, but their descendants are still visited, so text
 * inside them is emitted (entity-encoded). Relies on the module-level `DOM` adapter having been
 * initialized (see getInertElement).
 */
class SanitizingHtmlSerializer {
  /** Output fragments, joined once traversal is finished. */
  private buf: string[] = [];

  /**
   * Serializes the descendants of `el` (not `el` itself) and returns the sanitized HTML text.
   * Performs an iterative depth-first walk: descend to the first child when there is one,
   * otherwise close the current node and move to the next sibling or back up to the parent.
   */
  sanitizeChildren(el: Element): string {
    let current: Node = DOM.firstChild(el);
    while (current) {
      if (DOM.isElementNode(current)) {
        this.startElement(current);
      } else if (DOM.isTextNode(current)) {
        this.chars(DOM.nodeValue(current));
      }
      if (DOM.firstChild(current)) {
        current = DOM.firstChild(current);
        continue;
      }
      while (current) {
        // Leaving the element. Walk up and to the right, closing tags as we go.
        if (DOM.isElementNode(current)) {
          this.endElement(DOM.nodeName(current).toLowerCase());
        }
        if (DOM.nextSibling(current)) {
          current = DOM.nextSibling(current);
          break;
        }
        current = DOM.parentElement(current);
      }
    }
    return this.buf.join('');
  }

  /** Emits the opening tag of a whitelisted element together with its whitelisted attributes. */
  private startElement(element: any) {
    const tagName = DOM.nodeName(element).toLowerCase();
    if (!VALID_ELEMENTS.hasOwnProperty(tagName)) return;

    this.buf.push('<');
    this.buf.push(tagName);
    DOM.attributeMap(element).forEach((value: string, attrName: string) => {
      let lower = attrName.toLowerCase();
      if (!VALID_ATTRS.hasOwnProperty(lower)) return;
      // TODO(martinprobst): Special case image URIs for data:image/...
      if (URI_ATTRS.hasOwnProperty(lower)) value = sanitizeUrl(value);
      this.buf.push(' ');
      this.buf.push(attrName);
      this.buf.push('="');
      this.buf.push(encodeEntities(value));
      this.buf.push('"');
    });
    this.buf.push('>');
  }

  /** Emits the closing tag of a whitelisted element; void elements get no closing tag. */
  private endElement(tagName: string) {
    tagName = tagName.toLowerCase();
    if (VALID_ELEMENTS.hasOwnProperty(tagName) && !VOID_ELEMENTS.hasOwnProperty(tagName)) {
      this.buf.push('</');
      this.buf.push(tagName);
      this.buf.push('>');
    }
  }

  /** Emits text content, entity-encoded. */
  private chars(chars: string) { this.buf.push(encodeEntities(chars)); }
}

// Regular Expressions for parsing tags and attributes
const SURROGATE_PAIR_REGEXP = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;
// ! to ~ is the ASCII range.
const NON_ALPHANUMERIC_REGEXP = /([^\#-~ |!])/g;

/**
 * Escapes all potentially dangerous characters, so that the
 * resulting string can be safely inserted into attribute or
 * element text.
 *
 * `&` is escaped first so that the `&` of the numeric references produced afterwards is not
 * escaped again. Surrogate pairs become a single numeric reference for the combined code point;
 * any other character matched by NON_ALPHANUMERIC_REGEXP (this includes the double quote) becomes
 * a decimal numeric reference; `<` and `>` become named references.
 *
 * @param value text to escape
 * @returns escaped text
 */
function encodeEntities(value: string): string {
  return value.replace(/&/g, '&amp;')
      .replace(
          SURROGATE_PAIR_REGEXP,
          (match: string) => {
            let hi = match.charCodeAt(0);
            let low = match.charCodeAt(1);
            return '&#' + (((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000) + ';';
          })
      .replace(NON_ALPHANUMERIC_REGEXP, (match: string) => '&#' + match.charCodeAt(0) + ';')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
}

/**
 * When IE9-11 comes across an unknown namespaced attribute e.g. 'xlink:foo' it adds 'xmlns:ns1'
 * attribute to declare ns1 namespace and prefixes the attribute with 'ns1' (e.g. 'ns1:xlink:foo').
 *
 * This is undesirable since we don't want to allow any of these custom attributes. This method
 * strips them all, recursing into every descendant element.
 */
function stripCustomNsAttrs(el: any) {
  DOM.attributeMap(el).forEach((_, attrName) => {
    if (attrName === 'xmlns:ns1' || attrName.indexOf('ns1:') === 0) {
      DOM.removeAttribute(el, attrName);
    }
  });
  for (let n of DOM.childNodesAsList(el)) {
    if (DOM.isElementNode(n)) stripCustomNsAttrs(n);
  }
}

/**
 * Sanitizes the given unsafe, untrusted HTML fragment, and returns HTML text that is safe to add to
 * the DOM in a browser environment.
 *
 * @param unsafeHtml untrusted markup; falsy values are treated as the empty string.
 * @returns the sanitized markup.
 * @throws Error if the markup still changes between parse rounds after 5 attempts. Any error
 *     thrown here also resets the cached inert element before being rethrown.
 */
export function sanitizeHtml(unsafeHtml: string): string {
  try {
    let containerEl = getInertElement();
    // Make sure unsafeHtml is actually a string (TypeScript types are not enforced at runtime).
    unsafeHtml = unsafeHtml ? String(unsafeHtml) : '';

    // mXSS protection. Repeatedly parse the document to make sure it stabilizes, so that a browser
    // trying to auto-correct incorrect HTML cannot cause formerly inert HTML to become dangerous.
    let mXSSAttempts = 5;
    let parsedHtml = unsafeHtml;

    do {
      if (mXSSAttempts === 0) {
        throw new Error('Failed to sanitize html because the input is unstable');
      }
      mXSSAttempts--;

      unsafeHtml = parsedHtml;
      DOM.setInnerHTML(containerEl, unsafeHtml);
      if ((DOM.defaultDoc() as any).documentMode) {
        // strip custom-namespaced attributes on IE<=11
        stripCustomNsAttrs(containerEl);
      }
      parsedHtml = DOM.getInnerHTML(containerEl);
    } while (unsafeHtml !== parsedHtml);

    // For a <template> container the parsed nodes live in its content; otherwise in the element.
    let parent = DOM.getTemplateContent(containerEl) || containerEl;

    let sanitizer = new SanitizingHtmlSerializer();
    let safeHtml = sanitizer.sanitizeChildren(parent);

    // Clear out the body element.
    for (let child of DOM.childNodesAsList(parent)) {
      DOM.removeChild(parent, child);
    }

    // At this point unsafeHtml holds the stabilized, re-serialized markup, so the comparison is
    // against the browser-normalized input rather than the caller's original string.
    if (assertionsEnabled() && safeHtml !== unsafeHtml) {
      DOM.log('WARNING: some HTML contents were removed during sanitization.');
    }

    return safeHtml;
  } catch (e) {
    // In case anything goes wrong, clear out inertElement to reset the entire DOM structure.
    inertElement = null;
    throw e;
  }
}