FIX: Change the approach to sanitization. Includes a more detailed API

for allowing classes and attributes for only certain tag names.
2014-07-03 16:54:56 -04:00 · 2014-07-03 16:54:56 -04:00 · fc1ce96dbb
parent cfeae239a8
commit fc1ce96dbb
10 changed files with 128 additions and 58 deletions
--- a/app/assets/javascripts/defer/html-sanitizer-bundle.js
+++ b/app/assets/javascripts/defer/html-sanitizer-bundle.js
@ -2060,6 +2060,10 @@ var html = (function(html4) {
           html4.ATTRIBS.hasOwnProperty(attribKey))) {
        atype = html4.ATTRIBS[attribKey];
      }
+
+      // Discourse modification: give us more flexibility with whitelists
+      if (opt_nmTokenPolicy && opt_nmTokenPolicy(tagName, attribName, value)) { continue; }
+
      if (atype !== null) {
        switch (atype) {
          case html4.atype['NONE']: break;
@ -2108,17 +2112,6 @@ var html = (function(html4) {
              log(opt_logger, tagName, attribName, oldValue, value);
            }
            break;
-          case html4.atype['ID']:
-          case html4.atype['IDREF']:
-          case html4.atype['IDREFS']:
-          case html4.atype['GLOBAL_NAME']:
-          case html4.atype['LOCAL_NAME']:
-          case html4.atype['CLASSES']:
-            value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
-            if (opt_logger) {
-              log(opt_logger, tagName, attribName, oldValue, value);
-            }
-            break;
          case html4.atype['URI']:
            value = safeUri(value,
              getUriEffect(tagName, attribName),
@ -2135,7 +2128,6 @@ var html = (function(html4) {
          case html4.atype['URI_FRAGMENT']:
            if (value && '#' === value.charAt(0)) {
              value = value.substring(1);  // remove the leading '#'
-              value = opt_nmTokenPolicy ? opt_nmTokenPolicy(value) : value;
              if (value !== null && value !== void 0) {
                value = '#' + value;  // restore the leading '#'
              }
--- a/app/assets/javascripts/discourse/dialects/autolink_dialect.js
+++ b/app/assets/javascripts/discourse/dialects/autolink_dialect.js
@ -4,7 +4,7 @@
 **/
 var urlReplacerArgs = {
  matcher: /^((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/gm,
-  spaceBoundary: true,
+  spaceOrTagBoundary: true,

  emitter: function(matches) {
    var url = matches[1],
--- a/app/assets/javascripts/discourse/dialects/dialect.js
+++ b/app/assets/javascripts/discourse/dialects/dialect.js
@ -42,10 +42,6 @@ function processTextNodes(node, event, emitter) {
  for (var j=1; j<node.length; j++) {
    var textContent = node[j];
    if (typeof textContent === "string") {
-      if (dialect.options.sanitize && !skipSanitize[textContent]) {
-        textContent = Discourse.Markdown.sanitize(textContent);
-      }
-
      var result = emitter(textContent, event);
      if (result) {
        if (result instanceof Array) {
@ -118,13 +114,14 @@ function parseTree(tree, path, insideCounts) {
  @returns {Boolean} whether there is an invalid word boundary
 **/
 function invalidBoundary(args, prev) {
-  if (!args.wordBoundary && !args.spaceBoundary) { return false; }
+  if (!(args.wordBoundary || args.spaceBoundary || args.spaceOrTagBoundary)) { return false; }

  var last = prev[prev.length - 1];
  if (typeof last !== "string") { return false; }

  if (args.wordBoundary && (last.match(/(\w|\/)$/))) { return true; }
  if (args.spaceBoundary && (!last.match(/\s$/))) { return true; }
+  if (args.spaceOrTagBoundary && (!last.match(/(\s|\>)$/))) { return true; }
 }

 /**
@ -147,9 +144,19 @@ Discourse.Dialect = {
  cook: function(text, opts) {
    if (!initialized) { initializeDialects(); }
    dialect.options = opts;
-    var tree = parser.toHTMLTree(text, 'Discourse');
+    var tree = parser.toHTMLTree(text, 'Discourse'),
+        result = parser.renderJsonML(parseTree(tree));

-    return parser.renderJsonML(parseTree(tree));
+    // This feature is largely for MDTest. We prefer to strip comments
+    // in Discourse
+
+    if (opts.sanitize) {
+      result = Discourse.Markdown.sanitize(result);
+    } else if (opts.sanitizerFunction) {
+      result = opts.sanitizerFunction(result);
+    }
+
+    return result.trim();
  },

  /**
--- a/app/assets/javascripts/discourse/lib/markdown.js
+++ b/app/assets/javascripts/discourse/lib/markdown.js
@ -7,10 +7,51 @@
  @namespace Discourse
  @module Discourse
 **/
+var _validClasses = {},
+    _validIframes = [],
+    _validTags = {};
+
+function validateAttribute(tagName, attribName, value) {
+  var tag = _validTags[tagName];
+
+  // Handle classes
+  if (attribName === "class") {
+    if (_validClasses[value]) { return value; }
+
+    if (tag) {
+      var classes = tag['class'];
+      if (classes && (classes.indexOf(value) !== -1 || classes.indexOf('*') !== -1)) {
+        return value;
+      }
+    }
+  } else if (attribName.indexOf('data-') === 0) {
+    // data-* attributes
+    if (tag) {
+      var allowed = tag[attribName] || tag['data-*'];
+      if (allowed === value || allowed.indexOf('*') !== -1) { return value; }
+    }
+  }
+
+  if (tag) {
+    var attrs = tag[attribName];
+    if (attrs && (attrs.indexOf(value) !== -1 || attrs.indexOf('*') !== -1)) { return value; }
+  }
+}
+
 Discourse.Markdown = {

-  validClasses: {},
-  validIframes: [],
+  /**
+    Whitelist class for only a certain tag
+
+    @param {String} tagName to whitelist
+    @param {String} attribName to whitelist
+    @param {String} value to whitelist
+  **/
+  whiteListTag: function(tagName, attribName, value) {
+    _validTags[tagName] = _validTags[tagName] || {};
+    _validTags[tagName][attribName] = _validTags[tagName][attribName] || [];
+    _validTags[tagName][attribName].push(value || '*');
+  },

  /**
    Whitelists more classes for sanitization.
@ -19,10 +60,8 @@ Discourse.Markdown = {
    @method whiteListClass
  **/
  whiteListClass: function() {
-    var args = Array.prototype.slice.call(arguments),
-        validClasses = Discourse.Markdown.validClasses;
-
-    args.forEach(function (a) { validClasses[a] = true; });
+    var args = Array.prototype.slice.call(arguments);
+    args.forEach(function (a) { _validClasses[a] = true; });
  },

  /**
@ -32,7 +71,7 @@ Discourse.Markdown = {
    @param {Regexp} regexp The regexp to whitelist.
  **/
  whiteListIframe: function(regexp) {
-    Discourse.Markdown.validIframes.push(regexp);
+    _validIframes.push(regexp);
  },

  /**
@ -124,8 +163,8 @@ Discourse.Markdown = {

    // whitelist some iframe only
    if (hints && hints.XML_TAG === "iframe" && hints.XML_ATTR === "src") {
-      for (var i = 0, length = Discourse.Markdown.validIframes.length; i < length; i++) {
-        if(Discourse.Markdown.validIframes[i].test(url)) { return url; }
+      for (var i = 0, length = _validIframes.length; i < length; i++) {
+        if(_validIframes[i].test(url)) { return url; }
      }
      return;
    }
@ -144,12 +183,10 @@ Discourse.Markdown = {
    Checks to see if a name, class or id is allowed in the cooked content

    @method nameIdClassAllowed
-    @param {String} val The name, class or id to check
-    @return {String} val the transformed name class or id
+    @param {String} tagName to check
+    @param {String} attribName to check
+    @param {String} value to check
  **/
-  nameIdClassAllowed: function(val) {
-    if (Discourse.Markdown.validClasses[val]) { return val; }
-  },


  /**
@ -161,8 +198,11 @@ Discourse.Markdown = {
  **/
  sanitize: function(text) {
    if (!window.html_sanitize || !text) return "";
-    text = text.replace(/<([^A-Za-z\/]|$)/g, "&lt;$1");
-    return window.html_sanitize(text, Discourse.Markdown.urlAllowed, Discourse.Markdown.nameIdClassAllowed);
+
+    // Allow things like <3 and <_<
+    text = text.replace(/<([^A-Za-z\/\!]|$)/g, "&lt;$1");
+
+    return window.html_sanitize(text, Discourse.Markdown.urlAllowed, validateAttribute);
  },

  /**
@ -185,5 +225,14 @@ Discourse.Markdown = {
 };
 RSVP.EventTarget.mixin(Discourse.Markdown);

-Discourse.Markdown.whiteListClass("attachment");
+Discourse.Markdown.whiteListTag('a', 'class', 'attachment');
+Discourse.Markdown.whiteListTag('a', 'target', '_blank');
+Discourse.Markdown.whiteListTag('a', 'class', 'onebox');
+Discourse.Markdown.whiteListTag('a', 'class', 'mention');
+Discourse.Markdown.whiteListTag('div', 'class', 'title');
+Discourse.Markdown.whiteListTag('div', 'class', 'quote-controls');
+Discourse.Markdown.whiteListTag('code', 'class', '*');
+Discourse.Markdown.whiteListTag('span', 'class', 'mention');
+Discourse.Markdown.whiteListTag('aside', 'class', 'quote');
+Discourse.Markdown.whiteListTag('aside', 'data-*');
 Discourse.Markdown.whiteListIframe(/^(https?:)?\/\/www\.google\.com\/maps\/embed\?.+/i);
--- a/plugins/emoji/assets/javascripts/emoji.js.erb
+++ b/plugins/emoji/assets/javascripts/emoji.js.erb
@ -156,5 +156,5 @@
    });
  }

-  Discourse.Markdown.whiteListClass("emoji");
+  Discourse.Markdown.whiteListTag('img', 'class', 'emoji');
 }).call(this);
--- a/spec/components/pretty_text_spec.rb
+++ b/spec/components/pretty_text_spec.rb
@ -258,20 +258,11 @@ describe PrettyText do
      PrettyText.cook("**a*_b**").should match_html "<p><strong>a*_b</strong></p>"
    end

-    pending "small links" do
-      PrettyText.cook("<small>\nhttp://a.com\n<small>").should match_html "<p><small><br><a href=\"http://a.com\" rel=\"nofollow\">http://a.com</a><br></small></p>"
-    end
-
-
    it "does not apply italics when there is a space inside" do
      PrettyText.cook("** hello**").should match_html "<p>** hello**</p>"
      PrettyText.cook("**hello **").should match_html "<p>**hello **</p>"
    end

-    pending "allows comments through" do
-      PrettyText.cook("boom <!--comment-->").should match_html "<p>boom <!--comment--></p>"
-    end
-
    it "allows does not bold chinese intra word" do
      PrettyText.cook("你**hello**").should match_html "<p>你**hello**</p>"
    end
@ -279,11 +270,6 @@ describe PrettyText do
    it "allows bold chinese" do
      PrettyText.cook("**你hello**").should match_html "<p><strong>你hello</strong></p>"
    end
-
-    pending "does not break a streak for mentions" do
-      Fabricate(:user, username: 'sam')
-      PrettyText.cook("<small>a @sam c</small>").should match_html "<p><small>a <a class='mention' href='/users/sam'>@sam</a> c</small></p>"
-    end
  end

 end
--- a/test/javascripts/lib/markdown_test.js
+++ b/test/javascripts/lib/markdown_test.js
@ -6,6 +6,8 @@ module("Discourse.Markdown", {

 var cooked = function(input, expected, text) {
  var result = Discourse.Markdown.cook(input, {sanitize: true});
+  expected = expected.replace(/\/>/g, ">");
+  // result = result.replace("/>", ">");
  equal(result, expected, text);
 };

@ -138,6 +140,8 @@ test("Links", function() {
  cooked("User [MOD]: Hello!",
         "<p>User [MOD]: Hello!</p>",
         "It does not consider references that are obviously not URLs");
+
+  cooked("<small>http://eviltrout.com</small>", "<p><small><a href=\"http://eviltrout.com\">http://eviltrout.com</a></small></p>", "Links within HTML tags");
 });

 test("simple quotes", function() {
@ -240,6 +244,9 @@ test("Mentions", function() {
                "<p><a class=\"mention\" href=\"/users/eviltrout\">@eviltrout</a></p>",
                "it doesn't onebox mentions");

+  cookedOptions("<small>a @sam c</small>", alwaysTrue,
+                "<p><small>a <a class=\"mention\" href=\"/users/sam\">@sam</a> c</small></p>",
+                "it allows mentions within HTML tags");
 });


@ -370,7 +377,7 @@ test("sanitize", function() {

  cooked("[the answer](javascript:alert(42))", "<p><a>the answer</a></p>", "it prevents XSS");

-  cooked("<i class=\"fa fa-bug fa-spin\" style=\"font-size:600%\"></i>\n<!-- -->", "<p><i></i><br/>&lt;!-- --&gt;</p>", "it doesn't circumvent XSS with comments");
+  cooked("<i class=\"fa fa-bug fa-spin\" style=\"font-size:600%\"></i>\n<!-- -->", "<p><i></i><br/></p>", "it doesn't circumvent XSS with comments");
 });

 test("URLs in BBCode tags", function() {
--- a/test/javascripts/mdtest/fixtures/Images.xhtml
+++ b/test/javascripts/mdtest/fixtures/Images.xhtml
@ -12,7 +12,7 @@

 <p><img src="/url/" alt="alt text" title="with a title" />.</p>

-<p><img src="" alt="Empty" /></p>
+<p><img alt="Empty" /></p>

 <p><img src="http://example.com/(parens).jpg" alt="this is a stupid URL" /></p>

--- a/test/javascripts/mdtest/fixtures/Links,
+++ b/test/javascripts/mdtest/fixtures/Links,
@ -10,9 +10,9 @@

 <p><a href="/url/">URL wrapped in angle brackets</a>.</p>

-<p><a href="/url/" title="Here&#39;s the title">URL w/ angle brackets + title</a>.</p>
+<p><a href="/url/" title="Here's the title">URL w/ angle brackets + title</a>.</p>

-<p><a href="">Empty</a>.</p>
+<p><a>Empty</a>.</p>

 <p><a href="http://en.wikipedia.org/wiki/WIMP_(computing)">With parens in the URL</a></p>

--- a/test/javascripts/mdtest/mdtest.js.erb
+++ b/test/javascripts/mdtest/mdtest.js.erb
@ -8,13 +8,42 @@ module("MDTest", {
 // do not affect formatting. 
 function normalize(str) {
  return str.replace(/\n\s*/g, '').
-             replace(/ \/\>/g, '/>').
+             replace(/ \/\>/g, '>').
             replace(/    ?/g, "\t").
             replace(/&#34;/g, '&quot;');
 }

+// We use a custom sanitizer for MD test that hoists out comments. In Discourse
+// they are stripped, but to be compliant with the spec they should not be.
+function hoistingSanitizer(result) {
+  var hoisted,
+      m = result.match(/<!--[\s\S]*?-->/g);
+  if (m && m.length) {
+    hoisted = [];
+    for (var i=0; i<m.length; i++) {
+      var c = m[i],
+          id = "discourse:hoisted-comment:" + i;
+      result = result.replace(c, id);
+      hoisted.push([c, id]);
+    }
+  }
+
+  result = Discourse.Markdown.sanitize(result);
+
+  if (hoisted) {
+    hoisted.forEach(function(tuple) {
+      result = result.replace(tuple[1], tuple[0]);
+    });
+  }
+
+  return result;
+}
+
 var md = function(input, expected, text) {
-  var result = Discourse.Markdown.cook(input, {sanitize: true, traditional_markdown_linebreaks: true}),
+  var result = Discourse.Markdown.cook(input, {
+        sanitizerFunction: hoistingSanitizer,
+        traditional_markdown_linebreaks: true
+      }),
      resultNorm = normalize(result),
      expectedNorm = normalize(expected),
      same = (result === expected) || (resultNorm === expectedNorm);