From f0497ee9c4902307e2fcc1586bade35b54c8cec2 Mon Sep 17 00:00:00 2001
From: Vinoth Kannan <vinoth.kannan@discourse.org>
Date: Fri, 15 Dec 2017 10:28:20 +0530
Subject: [PATCH] FEATURE: HTML to Markdown conversion using native JavaScript
 ES6 classes (#5425)

---
 .../discourse/components/d-editor.js.es6      |  32 +-
 .../discourse}/helpers/parse-html.js.es6      |   2 +-
 .../discourse/lib/to-markdown.js.es6          | 285 ++++++++++++++++++
 app/assets/javascripts/vendor.js              |   1 +
 .../lib/category-badge-test.js.es6            |   4 +-
 test/javascripts/lib/to-markdown-test.js.es6  | 126 ++++++++
 6 files changed, 426 insertions(+), 24 deletions(-)
 rename {test/javascripts => app/assets/javascripts/discourse}/helpers/parse-html.js.es6 (99%)
 create mode 100644 app/assets/javascripts/discourse/lib/to-markdown.js.es6
 create mode 100644 test/javascripts/lib/to-markdown-test.js.es6

diff --git a/app/assets/javascripts/discourse/components/d-editor.js.es6 b/app/assets/javascripts/discourse/components/d-editor.js.es6
index eda16824b70..ba821726cbe 100644
--- a/app/assets/javascripts/discourse/components/d-editor.js.es6
+++ b/app/assets/javascripts/discourse/components/d-editor.js.es6
@@ -9,6 +9,7 @@ import { emojiUrlFor } from 'discourse/lib/text';
 import { getRegister } from 'discourse-common/lib/get-owner';
 import { findRawTemplate } from 'discourse/lib/raw-templates';
 import { determinePostReplaceSelection, clipboardData } from 'discourse/lib/utilities';
+import toMarkdown from 'discourse/lib/to-markdown';
 import { ajax } from 'discourse/lib/ajax';
 import { popupAjaxError } from 'discourse/lib/ajax-error';
 import deprecated from 'discourse-common/lib/deprecated';
@@ -647,7 +648,7 @@ export default Ember.Component.extend({
 
     const { clipboard, types } = clipboardData(e);
     let plainText = clipboard.getData("text/plain");
-    const html = clipboard.getData("text/html");
+    let html = clipboard.getData("text/html");
     let handled = false;
 
     if (plainText) {
@@ -657,30 +658,19 @@ export default Ember.Component.extend({
         this.appEvents.trigger('composer:insert-text', table);
         handled = true;
       }
+
+      if (html && html.includes("urn:schemas-microsoft-com:office:word")) {
+        html = ""; // use plain text data for microsoft word
+      }
     }
 
     if (this.siteSettings.enable_rich_text_paste && html && !handled) {
-      const placeholder = `${ plainText || I18n.t('pasting') }`;
-      const self = this;
+      const markdown = toMarkdown(html);
 
-      this.appEvents.trigger('composer:insert-text', placeholder);
-      handled = true;
-
-      ajax('/composer/parse_html', {
-        type: 'POST',
-        data: { html }
-      }).then(response => {
-        if (response.markdown) {
-          self.appEvents.trigger('composer:replace-text', placeholder, response.markdown);
-        } else if (!plainText) {
-          self.appEvents.trigger('composer:replace-text', placeholder, "");
-        }
-      }).catch(error => {
-        if (!plainText) {
-          self.appEvents.trigger('composer:replace-text', placeholder, "");
-          popupAjaxError(error);
-        }
-      });
+      if (!plainText || plainText.length < markdown.length) {
+        this.appEvents.trigger('composer:insert-text', markdown);
+        handled = true;
+      }
     }
 
     const uploadFiles = types.includes("Files") && !plainText && !handled;
diff --git a/test/javascripts/helpers/parse-html.js.es6 b/app/assets/javascripts/discourse/helpers/parse-html.js.es6
similarity index 99%
rename from test/javascripts/helpers/parse-html.js.es6
rename to app/assets/javascripts/discourse/helpers/parse-html.js.es6
index 0d3bab90d56..c9469fa6b9f 100644
--- a/test/javascripts/helpers/parse-html.js.es6
+++ b/app/assets/javascripts/discourse/helpers/parse-html.js.es6
@@ -5,4 +5,4 @@ export default function parseHTML(rawHtml) {
 
   parser.parseComplete(rawHtml);
   return builder.dom;
-}
\ No newline at end of file
+}
diff --git a/app/assets/javascripts/discourse/lib/to-markdown.js.es6 b/app/assets/javascripts/discourse/lib/to-markdown.js.es6
new file mode 100644
index 00000000000..c20a738d3ea
--- /dev/null
+++ b/app/assets/javascripts/discourse/lib/to-markdown.js.es6
@@ -0,0 +1,285 @@
+import parseHTML from 'discourse/helpers/parse-html';
+
+const trimLeft = text => text.replace(/^\s+/,""); // strips leading whitespace only
+const trimRight = text => text.replace(/\s+$/,""); // strips trailing whitespace only
+
+class Tag {
+  constructor(name, prefix = "", suffix = "") {
+    this.name = name;
+    this.prefix = prefix;
+    this.suffix = suffix;
+  }
+  // Default decoration: wrap text in prefix/suffix when either one is non-empty.
+  decorate(text) {
+    if (this.prefix || this.suffix) {
+      return [this.prefix, text, this.suffix].join("");
+    }
+
+    return text;
+  }
+  // Converts the element's children; decoration is skipped when the result is blank.
+  toMarkdown() {
+    const text = this.element.innerMarkdown();
+
+    if (text && text.trim()) {
+      return this.decorate(text);
+    }
+
+    return text;
+  }
+  // Block-level names. "hr" is left out on purpose: it is registered through Tag.replace in `tags`, and a block entry of the same name would be matched first and render it as "".
+  static blocks() {
+    return ["address", "article", "aside", "blockquote", "dd", "div", "dl", "dt", "fieldset",
+            "figcaption", "figure", "footer", "form", "header", "hgroup", "main", "nav",
+            "ol", "p", "pre", "section", "table", "ul"];
+  }
+
+  static headings() {
+    return ["h1", "h2", "h3", "h4", "h5", "h6"];
+  }
+
+  static emphases() {
+    return  [ ["b", "**"], ["strong", "**"], ["i", "_"], ["em", "_"], ["s", "~~"], ["strike", "~~"] ];
+  }
+
+  static slices() {
+    return ["dt", "dd", "tr", "thead", "tbody", "tfoot"];
+  }
+  // Names of siblings next to which a text node has its edge whitespace trimmed (see Element#leftTrimmable / rightTrimmable).
+  static trimmable() {
+    return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr"];
+  }
+  // Builds a Tag subclass that surrounds its decorated content with blank lines.
+  static block(name, prefix, suffix) {
+    return class extends Tag {
+      constructor() {
+        super(name, prefix, suffix);
+      }
+
+      decorate(text) {
+        return `\n\n${this.prefix}${text}${this.suffix}\n\n`;
+      }
+    };
+  }
+  // Heading of level i: a block prefixed with i "#" characters and one space.
+  static heading(name, i) {
+    const prefix = `${"#".repeat(i)} `;
+    return Tag.block(name, prefix, "");
+  }
+  // Inline emphasis; content containing a newline is emitted with the raw HTML tag instead of the Markdown decorator.
+  static emphasis(name, decorator) {
+    return class extends Tag {
+      constructor() {
+        super(name, decorator, decorator);
+      }
+
+      decorate(text) {
+        text = text.trim();
+
+        if (text.includes("\n")) {
+          this.prefix = `<${this.name}>`;
+          this.suffix = `</${this.name}>`;
+        }
+
+        return super.decorate(text);
+      }
+    };
+  }
+  // Tag that always renders as the fixed string `text`, ignoring its children.
+  static replace(name, text) {
+    return class extends Tag {
+      constructor() {
+        super(name, "", "");
+        this.text = text;
+      }
+
+      toMarkdown() {
+        return this.text;
+      }
+    };
+  }
+  // <a>: rendered as [text](href) unless href is missing or identical to the text.
+  static link() {
+    return class extends Tag {
+      constructor() {
+        super("a");
+      }
+
+      decorate(text) {
+        const attr = this.element.attributes;
+
+        if (attr && attr.href && text !== attr.href) {
+          return "[" + text + "](" + attr.href + ")";
+        }
+
+        return text;
+      }
+    };
+  }
+  // <img>: rendered as ![alt](src); src/alt fall back to the parent's attributes, and a missing src yields "".
+  static image() {
+    return class extends Tag {
+      constructor() {
+        super("img");
+      }
+
+      toMarkdown() {
+        const e = this.element;
+        const attr = e.attributes;
+        const pAttr = e.parent && e.parent.attributes;
+        const src = (attr && attr.src) || (pAttr && pAttr.src);
+
+        if (src) {
+          const alt = (attr && attr.alt) || (pAttr && pAttr.alt) || "";
+          return "![" + alt + "](" + src + ")";
+        }
+
+        return "";
+      }
+    };
+  }
+  // Tag that appends `suffix` after its content, except when the element has no next sibling; `prefix` is stored but not emitted.
+  static slice(name, prefix, suffix) {
+    return class extends Tag {
+      constructor() {
+        super(name, prefix, suffix);
+      }
+
+      decorate(text) {
+        if (!this.element.next) {
+          this.suffix = "";
+        }
+        return `${text}${this.suffix}`;
+      }
+    };
+  }
+  // Table cell: a slice separated from the following cell by a single space.
+  static cell(name) {
+    return Tag.slice(name, "", " ");
+  }
+  // <li>: "* " bullet, indented two spaces for every <ul> ancestor beyond the first.
+  static li() {
+    return class extends Tag.slice("li", "", "\n") {
+      decorate(text) {
+        const indent = this.element.filterParentNames("ul").slice(1).map(() => "  ").join("");
+        return super.decorate(`${indent}* ${trimLeft(text)}`);
+      }
+    };
+  }
+
+}
+
+const tags = [ // registry of Tag subclasses; Element#tag() takes the FIRST entry whose name matches, so order decides precedence
+  ...Tag.blocks().map((b) => Tag.block(b)),
+  ...Tag.headings().map((h, i) => Tag.heading(h, i + 1)), // h1..h6 -> levels 1..6
+  ...Tag.slices().map((s) => Tag.slice(s, "", "\n")), // one slice per line
+  ...Tag.emphases().map((e) => Tag.emphasis(e[0], e[1])), // e = [tagName, decorator]
+  Tag.cell("td"), Tag.cell("th"),
+  Tag.replace("br", "\n"), Tag.replace("hr", "\n---\n"), Tag.replace("head", ""), // fixed-output tags; <head> content is dropped
+  Tag.li(), Tag.link(), Tag.image(),
+
+  // TODO: CREATE: code, tbody, ins, del, blockquote, small, large
+  //       UPDATE: ol, pre, thead, th, td
+];
+
+class Element {
+  constructor(element, parent, previous, next) {
+    this.name = element.name;
+    this.type = element.type;
+    this.data = element.data;
+    this.children = element.children;
+    this.attributes = element.attributes;
+
+    this.parentNames = []; // always an array so filterParentNames() cannot throw on root-level nodes
+    if (parent) {
+      this.parent = parent;
+      this.parentNames = [...(parent.parentNames || []), parent.name];
+    }
+    this.previous = previous;
+    this.next = next;
+  }
+  // Instantiates the first registered tag class whose name matches this element, falling back to the plain Tag.
+  tag() {
+    const tag = new (tags.find(t => new t().name === this.name) || Tag)();
+    tag.element = this;
+    return tag;
+  }
+  // Markdown produced by this element's children.
+  innerMarkdown() {
+    return Element.parseChildren(this);
+  }
+  // Truthy when the previous sibling's name is in Tag.trimmable().
+  leftTrimmable() {
+    return this.previous && Tag.trimmable().includes(this.previous.name);
+  }
+  // Truthy when the next sibling's name is in Tag.trimmable().
+  rightTrimmable() {
+    return this.next && Tag.trimmable().includes(this.next.name);
+  }
+  // Text-node content: edges trimmed next to trimmable siblings, runs of spaces/tabs collapsed to one space.
+  text() {
+    let text = this.data || "";
+
+    if (this.leftTrimmable()) {
+      text = trimLeft(text);
+    }
+
+    if (this.rightTrimmable()) {
+      text = trimRight(text);
+    }
+
+    text = text.replace(/[ \t]+/g, " ");
+
+    return text;
+  }
+  // Dispatches on the parser node type; any type other than "text" or "tag" contributes nothing.
+  toMarkdown() {
+    switch(this.type) {
+      case "text":
+        return this.text();
+      case "tag":
+        return this.tag().toMarkdown();
+      default:
+        return "";
+    }
+  }
+  // Returns one entry per ancestor whose name equals `name`.
+  filterParentNames(name) {
+    return this.parentNames.filter(p => p === name);
+  }
+
+  static toMarkdown(element, parent, prev, next) {
+    return new Element(element, parent, prev, next).toMarkdown();
+  }
+
+  static parseChildren(parent) {
+    return Element.parse(parent.children, parent);
+  }
+  // Converts a list of parser nodes, handing each one its parent and its immediate siblings (null at either end).
+  static parse(elements, parent = null) {
+    if (elements) {
+      const result = [];
+
+      for (let i = 0; i < elements.length; i++) {
+        const prev = (i === 0) ? null : elements[i-1];
+        const next = (i === elements.length - 1) ? null : elements[i+1];
+
+        result.push(Element.toMarkdown(elements[i], parent, prev, next));
+      }
+
+      return result.join("");
+    }
+
+    return "";
+  }
+}
+
+export default function toMarkdown(html) {
+  let output = "";
+  try {
+    const parsed = Element.parse(parseHTML(html)).trim();
+    const unwrapped = parsed.replace(/^<b>/, "").replace(/<\/b>$/, "").trim(); // fix for google doc copy paste
+    output = unwrapped.replace(/\r/g, "").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n");
+  } catch(err) { /* best effort: any failure while parsing or converting yields "" */ }
+  return output;
+}
diff --git a/app/assets/javascripts/vendor.js b/app/assets/javascripts/vendor.js
index 4b500d18bc8..6a7bd345eca 100644
--- a/app/assets/javascripts/vendor.js
+++ b/app/assets/javascripts/vendor.js
@@ -37,3 +37,4 @@
 //= require virtual-dom
 //= require virtual-dom-amd
 //= require highlight.js
+//= require htmlparser.js
diff --git a/test/javascripts/lib/category-badge-test.js.es6 b/test/javascripts/lib/category-badge-test.js.es6
index ffd837a5e61..6dc1ba2c513 100644
--- a/test/javascripts/lib/category-badge-test.js.es6
+++ b/test/javascripts/lib/category-badge-test.js.es6
@@ -2,7 +2,7 @@ import createStore from 'helpers/create-store';
 
 QUnit.module("lib:category-link");
 
-import parseHTML from 'helpers/parse-html';
+import parseHTML from 'discourse/helpers/parse-html';
 import { categoryBadgeHTML } from "discourse/helpers/category-link";
 
 QUnit.test("categoryBadge without a category", assert => {
@@ -44,4 +44,4 @@ QUnit.test("allowUncategorized", assert => {
 
   assert.blank(categoryBadgeHTML(uncategorized), "it doesn't return HTML for uncategorized by default");
   assert.present(categoryBadgeHTML(uncategorized, {allowUncategorized: true}), "it returns HTML");
-});
\ No newline at end of file
+});
diff --git a/test/javascripts/lib/to-markdown-test.js.es6 b/test/javascripts/lib/to-markdown-test.js.es6
new file mode 100644
index 00000000000..96ee32db5ac
--- /dev/null
+++ b/test/javascripts/lib/to-markdown-test.js.es6
@@ -0,0 +1,126 @@
+import toMarkdown from 'discourse/lib/to-markdown';
+
+QUnit.module("lib:to-markdown");
+
+QUnit.test("converts styles between normal words", assert => { // <s> -> ~~, <b> -> **, <i> -> _; spacing around them is kept
+  const html = `Line with <s>styles</s> <b><i>between</i></b> words.`;
+  const markdown = `Line with ~~styles~~ **_between_** words.`;
+  assert.equal(toMarkdown(html), markdown);
+});
+
+QUnit.test("converts inline nested styles", assert => { // second case: emphasis spanning a newline keeps the raw tag, attributes dropped
+  let html = `<em>Italicised line with <strong>some random</strong> <b>bold</b> words.</em>`;
+  let markdown = `_Italicised line with **some random** **bold** words._`;
+  assert.equal(toMarkdown(html), markdown);
+
+  html = `<i class="fa">Italicised line
+   with <b title="strong">some
+   random</b> <s>bold</s> words.</i>`;
+  markdown = `<i>Italicised line\n with <b>some\n random</b> ~~bold~~ words.</i>`;
+  assert.equal(toMarkdown(html), markdown);
+});
+
+QUnit.test("converts a link", assert => { // anchor whose text differs from its href becomes [text](href)
+  const html = `<a href="https://discourse.org">Discourse</a>`;
+  const markdown = `[Discourse](https://discourse.org)`;
+  assert.equal(toMarkdown(html), markdown);
+});
+
+QUnit.test("put raw URL instead of converting the link", assert => { // anchor text identical to href -> bare URL
+  let url = "https://discourse.org";
+  const html = () => `<a href="${url}">${url}</a>`; // function so the reassigned url below is picked up
+
+  assert.equal(toMarkdown(html()), url);
+
+  url = "discourse.org/t/topic-slug/1";
+  assert.equal(toMarkdown(html()), url);
+});
+
+QUnit.test("skip empty link", assert => { // anchor without any content produces no output
+  assert.equal(toMarkdown(`<a href="https://example.com"></a>`), "");
+});
+
+QUnit.test("converts heading tags", assert => { // h1..h6 -> 1..6 "#"; irregular whitespace between headings collapses to one blank line
+  const html = `
+  <h1>Heading 1</h1>
+  <h2>Heading 2</h2>
+
+  \t  <h3>Heading 3</h3>
+
+
+  <h4>Heading 4</h4>
+
+
+
+<h5>Heading 5</h5>
+
+
+
+
+<h6>Heading 6</h6>
+  `;
+  const markdown = `# Heading 1\n\n## Heading 2\n\n### Heading 3\n\n#### Heading 4\n\n##### Heading 5\n\n###### Heading 6`;
+  assert.equal(toMarkdown(html), markdown);
+});
+
+QUnit.test("converts ul and ol list tags", assert => { // NOTE(review): only <ul> is exercised here; each nested <ul> level indents by two spaces
+  const html = `
+  <ul>
+    <li>Item 1</li>
+    <li>
+      Item 2
+      <ul>
+        <li>Sub Item 1</li>
+        <li>Sub Item 2</li>
+        <ul><li>Sub <i>Sub</i> Item 1</li><li>Sub <b>Sub</b> Item 2</li></ul>
+      </ul>
+    </li>
+    <li>Item 3</li>
+  </ul>
+  `;
+  const markdown = `* Item 1\n* Item 2\n\n  * Sub Item 1\n  * Sub Item 2\n\n    * Sub _Sub_ Item 1\n    * Sub **Sub** Item 2\n\n* Item 3`;
+  assert.equal(toMarkdown(html), markdown);
+});
+
+QUnit.test("stripes unwanted inline tags", assert => { // unregistered inline tags (span, label, nisi) are removed but their text is kept
+  const html = `
+  <p>Lorem ipsum <span>dolor sit amet, consectetur</span> <strike>elit.</strike></p>
+  <p>Ut minim veniam, <label>quis nostrud</label> laboris <nisi> ut aliquip ex ea</nisi> commodo.</p>
+  `;
+  const markdown = `Lorem ipsum dolor sit amet, consectetur ~~elit.~~\n\nUt minim veniam, quis nostrud laboris  ut aliquip ex ea commodo.`;
+  assert.equal(toMarkdown(html), markdown);
+});
+
+QUnit.test("converts table as readable", assert => { // tables become plain text: cells joined by a space, one row per line
+  const html = `<address>Discourse Avenue</address><b>laboris</b>
+  <table>
+    <thead> <tr><th>Heading 1</th><th>Head 2</th></tr> </thead>
+      <tbody>
+        <tr><td>Lorem</td><td>ipsum</td></tr>
+        <tr><td><b>dolor</b></td> <td><i>sit amet</i></td></tr></tbody>
+</table>
+  `;
+  const markdown = `Discourse Avenue\n\n**laboris**\n\nHeading 1 Head 2\n\nLorem ipsum\n**dolor** _sit amet_`;
+  assert.equal(toMarkdown(html), markdown);
+});
+
+QUnit.test("converts img tag", assert => { // covers bare, nested, linked, src-less and href-less-anchor images
+  const url = "https://example.com/image.png";
+  let html = `<img src="${url}">`;
+  assert.equal(toMarkdown(html), `![](${url})`);
+
+  html = `<div><span><img src="${url}" alt="description" /></span></div>`;
+  assert.equal(toMarkdown(html), `![description](${url})`);
+
+  html = `<a href="http://example.com"><img src="${url}" alt="description" /></a>`;
+  assert.equal(toMarkdown(html), `[![description](${url})](http://example.com)`);
+
+  html = `<a href="http://example.com">description <img src="${url}" /></a>`;
+  assert.equal(toMarkdown(html), `[description ![](${url})](http://example.com)`);
+
+  html = `<img alt="description" />`; // no src -> image is dropped entirely
+  assert.equal(toMarkdown(html), "");
+
+  html = `<a><img src="${url}" alt="description" /></a>`; // anchor without href -> only the image is emitted
+  assert.equal(toMarkdown(html), `![description](${url})`);
+});