From f0497ee9c4902307e2fcc1586bade35b54c8cec2 Mon Sep 17 00:00:00 2001 From: Vinoth Kannan Date: Fri, 15 Dec 2017 10:28:20 +0530 Subject: [PATCH] FEATURE: HTML to Markdown conversion using native JavaScript ES6 classes (#5425) --- .../discourse/components/d-editor.js.es6 | 32 +- .../discourse}/helpers/parse-html.js.es6 | 2 +- .../discourse/lib/to-markdown.js.es6 | 285 ++++++++++++++++++ app/assets/javascripts/vendor.js | 1 + .../lib/category-badge-test.js.es6 | 4 +- test/javascripts/lib/to-markdown-test.js.es6 | 126 ++++++++ 6 files changed, 426 insertions(+), 24 deletions(-) rename {test/javascripts => app/assets/javascripts/discourse}/helpers/parse-html.js.es6 (99%) create mode 100644 app/assets/javascripts/discourse/lib/to-markdown.js.es6 create mode 100644 test/javascripts/lib/to-markdown-test.js.es6 diff --git a/app/assets/javascripts/discourse/components/d-editor.js.es6 b/app/assets/javascripts/discourse/components/d-editor.js.es6 index eda16824b70..ba821726cbe 100644 --- a/app/assets/javascripts/discourse/components/d-editor.js.es6 +++ b/app/assets/javascripts/discourse/components/d-editor.js.es6 @@ -9,6 +9,7 @@ import { emojiUrlFor } from 'discourse/lib/text'; import { getRegister } from 'discourse-common/lib/get-owner'; import { findRawTemplate } from 'discourse/lib/raw-templates'; import { determinePostReplaceSelection, clipboardData } from 'discourse/lib/utilities'; +import toMarkdown from 'discourse/lib/to-markdown'; import { ajax } from 'discourse/lib/ajax'; import { popupAjaxError } from 'discourse/lib/ajax-error'; import deprecated from 'discourse-common/lib/deprecated'; @@ -647,7 +648,7 @@ export default Ember.Component.extend({ const { clipboard, types } = clipboardData(e); let plainText = clipboard.getData("text/plain"); - const html = clipboard.getData("text/html"); + let html = clipboard.getData("text/html"); let handled = false; if (plainText) { @@ -657,30 +658,19 @@ export default Ember.Component.extend({ this.appEvents.trigger('composer:insert-text', table); handled = true; } + + if (html && html.includes("urn:schemas-microsoft-com:office:word")) { + html = ""; // use plain text data for microsoft word + } } if (this.siteSettings.enable_rich_text_paste && html && !handled) { - const placeholder = `${ plainText || I18n.t('pasting') }`; - const self = this; + const markdown = toMarkdown(html); - this.appEvents.trigger('composer:insert-text', placeholder); - handled = true; - - ajax('/composer/parse_html', { - type: 'POST', - data: { html } - }).then(response => { - if (response.markdown) { - self.appEvents.trigger('composer:replace-text', placeholder, response.markdown); - } else if (!plainText) { - self.appEvents.trigger('composer:replace-text', placeholder, ""); - } - }).catch(error => { - if (!plainText) { - self.appEvents.trigger('composer:replace-text', placeholder, ""); - popupAjaxError(error); - } - }); + if (!plainText || plainText.length < markdown.length) { + this.appEvents.trigger('composer:insert-text', markdown); + handled = true; + } } const uploadFiles = types.includes("Files") && !plainText && !handled; diff --git a/test/javascripts/helpers/parse-html.js.es6 b/app/assets/javascripts/discourse/helpers/parse-html.js.es6 similarity index 99% rename from test/javascripts/helpers/parse-html.js.es6 rename to app/assets/javascripts/discourse/helpers/parse-html.js.es6 index 0d3bab90d56..c9469fa6b9f 100644 --- a/test/javascripts/helpers/parse-html.js.es6 +++ b/app/assets/javascripts/discourse/helpers/parse-html.js.es6 @@ -5,4 +5,4 @@ export default function parseHTML(rawHtml) { parser.parseComplete(rawHtml); return builder.dom; -} \ No newline at end of file +} diff --git a/app/assets/javascripts/discourse/lib/to-markdown.js.es6 b/app/assets/javascripts/discourse/lib/to-markdown.js.es6 new file mode 100644 index 00000000000..c20a738d3ea --- /dev/null +++ b/app/assets/javascripts/discourse/lib/to-markdown.js.es6 @@ -0,0 +1,285 @@ +import parseHTML from 'discourse/helpers/parse-html'; + +const trimLeft = text => text.replace(/^\s+/,""); +const trimRight = text => text.replace(/\s+$/,""); + +class Tag { + constructor(name, prefix = "", suffix = "") { + this.name = name; + this.prefix = prefix; + this.suffix = suffix; + } + + decorate(text) { + if (this.prefix || this.suffix) { + return [this.prefix, text, this.suffix].join(""); + } + + return text; + } + + toMarkdown() { + const text = this.element.innerMarkdown(); + + if (text && text.trim()) { + return this.decorate(text); + } + + return text; + } + + static blocks() { + return ["address", "article", "aside", "blockquote", "dd", "div", "dl", "dt", "fieldset", + "figcaption", "figure", "footer", "form", "header", "hgroup", "hr", "main", "nav", + "ol", "p", "pre", "section", "table", "ul"]; + } + + static headings() { + return ["h1", "h2", "h3", "h4", "h5", "h6"]; + } + + static emphases() { + return [ ["b", "**"], ["strong", "**"], ["i", "_"], ["em", "_"], ["s", "~~"], ["strike", "~~"] ]; + } + + static slices() { + return ["dt", "dd", "tr", "thead", "tbody", "tfoot"]; + } + + static trimmable() { + return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr"]; + } + + static block(name, prefix, suffix) { + return class extends Tag { + constructor() { + super(name, prefix, suffix); + } + + decorate(text) { + return `\n\n${this.prefix}${text}${this.suffix}\n\n`; + } + }; + } + + static heading(name, i) { + const prefix = `${[...Array(i)].map(() => "#").join("")} `; + return Tag.block(name, prefix, ""); + } + + static emphasis(name, decorator) { + return class extends Tag { + constructor() { + super(name, decorator, decorator); + } + + decorate(text) { + text = text.trim(); + + if (text.includes("\n")) { + this.prefix = `<${this.name}>`; + this.suffix = ``; + } + + return super.decorate(text); + } + }; + } + + static replace(name, text) { + return class extends Tag { + constructor() { + super(name, "", ""); + this.text = text; + } + + toMarkdown() { + return this.text; + } + }; + } + + static link() { + return class extends Tag { + constructor() { + super("a"); + } + + decorate(text) { + const attr = this.element.attributes; + + if (attr && attr.href && text !== attr.href) { + return "[" + text + "](" + attr.href + ")"; + } + + return text; + } + }; + } + + static image() { + return class extends Tag { + constructor() { + super("img"); + } + + toMarkdown() { + const e = this.element; + const attr = e.attributes; + const pAttr = e.parent && e.parent.attributes; + const src = (attr && attr.src) || (pAttr && pAttr.src); + + if (src) { + const alt = (attr && attr.alt) || (pAttr && pAttr.alt) || ""; + return "![" + alt + "](" + src + ")"; + } + + return ""; + } + }; + } + + static slice(name, prefix, suffix) { + return class extends Tag { + constructor() { + super(name, prefix, suffix); + } + + decorate(text) { + if (!this.element.next) { + this.suffix = ""; + } + return `${text}${this.suffix}`; + } + }; + } + + static cell(name) { + return Tag.slice(name, "", " "); + } + + static li() { + return class extends Tag.slice("li", "", "\n") { + decorate(text) { + const indent = this.element.filterParentNames("ul").slice(1).map(() => " ").join(""); + return super.decorate(`${indent}* ${trimLeft(text)}`); + } + }; + } + +} + +const tags = [ + ...Tag.blocks().map((b) => Tag.block(b)), + ...Tag.headings().map((h, i) => Tag.heading(h, i + 1)), + ...Tag.slices().map((s) => Tag.slice(s, "", "\n")), + ...Tag.emphases().map((e) => Tag.emphasis(e[0], e[1])), + Tag.cell("td"), Tag.cell("th"), + Tag.replace("br", "\n"), Tag.replace("hr", "\n---\n"), Tag.replace("head", ""), + Tag.li(), Tag.link(), Tag.image(), + + // TO-DO CREATE: code, tbody, ins, del, blockquote, small, large + // UPDATE: ol, pre, thead, th, td +]; + +class Element { + constructor(element, parent, previous, next) { + this.name = element.name; + this.type = element.type; + this.data = element.data; + this.children = element.children; + this.attributes = element.attributes; + + if (parent) { + this.parent = parent; + this.parentNames = (parent.parentNames || []).slice(); + this.parentNames.push(parent.name); + } + this.previous = previous; + this.next = next; + } + + tag() { + const tag = new (tags.filter(t => (new t().name === this.name))[0] || Tag)(); + tag.element = this; + return tag; + } + + innerMarkdown() { + return Element.parseChildren(this); + } + + leftTrimmable() { + return this.previous && Tag.trimmable().includes(this.previous.name); + } + + rightTrimmable() { + return this.next && Tag.trimmable().includes(this.next.name); + } + + text() { + let text = this.data || ""; + + if (this.leftTrimmable()) { + text = trimLeft(text); + } + + if (this.rightTrimmable()) { + text = trimRight(text); + } + + text = text.replace(/[ \t]+/g, " "); + + return text; + } + + toMarkdown() { + switch(this.type) { + case "text": + return this.text(); + break; + case "tag": + return this.tag().toMarkdown(); + break; + } + } + + filterParentNames(name) { + return this.parentNames.filter(p => p === name); + } + + static toMarkdown(element, parent, prev, next) { + return new Element(element, parent, prev, next).toMarkdown(); + } + + static parseChildren(parent) { + return Element.parse(parent.children, parent); + } + + static parse(elements, parent = null) { + if (elements) { + let result = []; + + for (let i = 0; i < elements.length; i++) { + const prev = (i === 0) ? null : elements[i-1]; + const next = (i === elements.length) ? null : elements[i+1]; + + result.push(Element.toMarkdown(elements[i], parent, prev, next)); + } + + return result.join(""); + } + + return ""; + } +} + +export default function toMarkdown(html) { + try { + let markdown = Element.parse(parseHTML(html)).trim(); + markdown = markdown.replace(/^/, "").replace(/<\/b>$/, "").trim(); // fix for google doc copy paste + return markdown.replace(/\r/g, "").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n"); + } catch(err) { + return ""; + } +} diff --git a/app/assets/javascripts/vendor.js b/app/assets/javascripts/vendor.js index 4b500d18bc8..6a7bd345eca 100644 --- a/app/assets/javascripts/vendor.js +++ b/app/assets/javascripts/vendor.js @@ -37,3 +37,4 @@ //= require virtual-dom //= require virtual-dom-amd //= require highlight.js +//= require htmlparser.js diff --git a/test/javascripts/lib/category-badge-test.js.es6 b/test/javascripts/lib/category-badge-test.js.es6 index ffd837a5e61..6dc1ba2c513 100644 --- a/test/javascripts/lib/category-badge-test.js.es6 +++ b/test/javascripts/lib/category-badge-test.js.es6 @@ -2,7 +2,7 @@ import createStore from 'helpers/create-store'; QUnit.module("lib:category-link"); -import parseHTML from 'helpers/parse-html'; +import parseHTML from 'discourse/helpers/parse-html'; import { categoryBadgeHTML } from "discourse/helpers/category-link"; QUnit.test("categoryBadge without a category", assert => { @@ -44,4 +44,4 @@ QUnit.test("allowUncategorized", assert => { assert.blank(categoryBadgeHTML(uncategorized), "it doesn't return HTML for uncategorized by default"); assert.present(categoryBadgeHTML(uncategorized, {allowUncategorized: true}), "it returns HTML"); -}); \ No newline at end of file +}); diff --git a/test/javascripts/lib/to-markdown-test.js.es6 b/test/javascripts/lib/to-markdown-test.js.es6 new file mode 100644 index 00000000000..96ee32db5ac --- /dev/null +++ b/test/javascripts/lib/to-markdown-test.js.es6 @@ -0,0 +1,126 @@ +import toMarkdown from 'discourse/lib/to-markdown'; + +QUnit.module("lib:to-markdown"); + +QUnit.test("converts styles between normal words", assert => { + const html = `Line with styles between words.`; + const markdown = `Line with ~~styles~~ **_between_** words.`; + assert.equal(toMarkdown(html), markdown); +}); + +QUnit.test("converts inline nested styles", assert => { + let html = `Italicised line with some random bold words.`; + let markdown = `_Italicised line with **some random** **bold** words._`; + assert.equal(toMarkdown(html), markdown); + + html = `Italicised line + with some + random bold words.`; + markdown = `Italicised line\n with some\n random ~~bold~~ words.`; + assert.equal(toMarkdown(html), markdown); +}); + +QUnit.test("converts a link", assert => { + const html = `Discourse`; + const markdown = `[Discourse](https://discourse.org)`; + assert.equal(toMarkdown(html), markdown); +}); + +QUnit.test("put raw URL instead of converting the link", assert => { + let url = "https://discourse.org"; + const html = () => `${url}`; + + assert.equal(toMarkdown(html()), url); + + url = "discourse.org/t/topic-slug/1"; + assert.equal(toMarkdown(html()), url); +}); + +QUnit.test("skip empty link", assert => { + assert.equal(toMarkdown(``), ""); +}); + +QUnit.test("converts heading tags", assert => { + const html = ` +

Heading 1

+

Heading 2

+ + \t

Heading 3

+ + +

Heading 4

+ + + +
Heading 5
+ + + + +
Heading 6
+ `; + const markdown = `# Heading 1\n\n## Heading 2\n\n### Heading 3\n\n#### Heading 4\n\n##### Heading 5\n\n###### Heading 6`; + assert.equal(toMarkdown(html), markdown); +}); + +QUnit.test("converts ul and ol list tags", assert => { + const html = ` + + `; + const markdown = `* Item 1\n* Item 2\n\n * Sub Item 1\n * Sub Item 2\n\n * Sub _Sub_ Item 1\n * Sub **Sub** Item 2\n\n* Item 3`; + assert.equal(toMarkdown(html), markdown); +}); + +QUnit.test("stripes unwanted inline tags", assert => { + const html = ` +

Lorem ipsum dolor sit amet, consectetur elit.

+

Ut minim veniam, laboris ut aliquip ex ea commodo.

+ `; + const markdown = `Lorem ipsum dolor sit amet, consectetur ~~elit.~~\n\nUt minim veniam, quis nostrud laboris ut aliquip ex ea commodo.`; + assert.equal(toMarkdown(html), markdown); +}); + +QUnit.test("converts table as readable", assert => { + const html = `
Discourse Avenue
laboris + + + + + +
Heading 1Head 2
Loremipsum
dolor sit amet
+ `; + const markdown = `Discourse Avenue\n\n**laboris**\n\nHeading 1 Head 2\n\nLorem ipsum\n**dolor** _sit amet_`; + assert.equal(toMarkdown(html), markdown); +}); + +QUnit.test("converts img tag", assert => { + const url = "https://example.com/image.png"; + let html = ``; + assert.equal(toMarkdown(html), `![](${url})`); + + html = `
description
`; + assert.equal(toMarkdown(html), `![description](${url})`); + + html = `description`; + assert.equal(toMarkdown(html), `[![description](${url})](http://example.com)`); + + html = `description `; + assert.equal(toMarkdown(html), `[description ![](${url})](http://example.com)`); + + html = `description`; + assert.equal(toMarkdown(html), ""); + + html = `description`; + assert.equal(toMarkdown(html), `![description](${url})`); +});