Remove unwanted spaces between HTML tags and support Word documents

This commit is contained in:
Vinoth Kannan 2017-12-22 09:28:24 +05:30
parent 3bc53f2946
commit 4935ae4338
3 changed files with 51 additions and 26 deletions

View File

@ -662,8 +662,6 @@ export default Ember.Component.extend({
if (table) { if (table) {
this.appEvents.trigger('composer:insert-text', table); this.appEvents.trigger('composer:insert-text', table);
handled = true; handled = true;
} else if (html && html.includes("urn:schemas-microsoft-com:office:word")) {
html = ""; // use plain text data for microsoft word
} }
} }

View File

@ -38,15 +38,15 @@ class Tag {
} }
static emphases() { static emphases() {
return [ ["b", "**"], ["strong", "**"], ["i", "_"], ["em", "_"], ["s", "~~"], ["strike", "~~"] ]; return [ ["b", "**"], ["strong", "**"], ["i", "*"], ["em", "*"], ["s", "~~"], ["strike", "~~"] ];
} }
static slices() { static slices() {
return ["dt", "dd", "tr", "thead", "tbody", "tfoot"]; return ["dt", "dd", "thead", "tbody", "tfoot"];
} }
static trimmable() { static trimmable() {
return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr", "blockquote", "table", "ol"]; return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr", "blockquote", "table", "ol", "tr"];
} }
static block(name, prefix, suffix) { static block(name, prefix, suffix) {
@ -73,14 +73,17 @@ class Tag {
} }
decorate(text) { decorate(text) {
text = text.trim();
if (text.includes("\n")) { if (text.includes("\n")) {
this.prefix = `<${this.name}>`; this.prefix = `<${this.name}>`;
this.suffix = `</${this.name}>`; this.suffix = `</${this.name}>`;
} }
return super.decorate(text); let space = text.match(/^\s/) || [""];
this.prefix = space[0] + this.prefix;
space = text.match(/\s$/) || [""];
this.suffix = this.suffix + space[0];
return super.decorate(text.trim());
} }
}; };
} }
@ -182,10 +185,6 @@ class Tag {
throw "Unsupported format inside Markdown table cells"; throw "Unsupported format inside Markdown table cells";
} }
if (!this.element.next) {
this.suffix = "|";
}
return this.decorate(text); return this.decorate(text);
} }
}; };
@ -268,6 +267,17 @@ class Tag {
}; };
} }
static tr() {
return class extends Tag.slice("tr", "|\n") {
decorate(text) {
if (!this.element.next) {
this.suffix = "|";
}
return `${text}${this.suffix}`;
}
};
}
} }
const tags = [ const tags = [
@ -278,7 +288,7 @@ const tags = [
Tag.cell("td"), Tag.cell("th"), Tag.cell("td"), Tag.cell("th"),
Tag.replace("br", "\n"), Tag.replace("hr", "\n---\n"), Tag.replace("head", ""), Tag.replace("br", "\n"), Tag.replace("hr", "\n---\n"), Tag.replace("head", ""),
Tag.keep("ins"), Tag.keep("del"), Tag.keep("small"), Tag.keep("big"), Tag.keep("ins"), Tag.keep("del"), Tag.keep("small"), Tag.keep("big"),
Tag.li(), Tag.link(), Tag.image(), Tag.code(), Tag.blockquote(), Tag.table(),, Tag.ol(), Tag.li(), Tag.link(), Tag.image(), Tag.code(), Tag.blockquote(), Tag.table(), Tag.ol(), Tag.tr(),
]; ];
class Element { class Element {
@ -375,6 +385,19 @@ class Element {
} }
} }
/**
 * Normalises whitespace in pasted HTML before it is handed to the parser.
 *
 * Steps, in order:
 *  1. If the markup contains a `<body>` element (matched case-insensitively),
 *     only its inner HTML is kept; otherwise the input is used as-is.
 *  2. Every carriage return, newline and `&nbsp;` entity becomes one space.
 *  3. Any run of two or more whitespace characters that sits directly
 *     between two tags is collapsed to a single space.
 *
 * Whitespace inside text content (not directly between two tags) is left
 * untouched by step 3.
 *
 * @param {string} html - raw HTML markup
 * @returns {string} the markup with inter-tag whitespace collapsed
 */
function trimUnwantedSpaces(html) {
  const body = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  const content = (body ? body[1] : html).replace(/\r|\n|&nbsp;/g, " ");

  // A single global pass replaces the previous match/replace loop. The
  // following tag is only looked ahead at (not consumed) so it can act as the
  // leading tag of the next match, e.g. `<a>  <b>  <c>`. The replacement is a
  // fixed pattern, so `$` sequences that occur in the markup itself are never
  // interpreted as replacement patterns.
  return content.replace(/(<[^\s>]+[^>]*>)\s{2,}(?=<[^\s>]+[^>]*>)/g, "$1 ");
}
function putPlaceholders(html) { function putPlaceholders(html) {
const codeRegEx = /<code[^>]*>([\s\S]*?)<\/code>/gi; const codeRegEx = /<code[^>]*>([\s\S]*?)<\/code>/gi;
const origHtml = html; const origHtml = html;
@ -390,7 +413,7 @@ function putPlaceholders(html) {
match = codeRegEx.exec(origHtml); match = codeRegEx.exec(origHtml);
} }
const elements = parseHTML(html); const elements = parseHTML(trimUnwantedSpaces(html));
return { elements, placeholders }; return { elements, placeholders };
} }
@ -406,7 +429,7 @@ export default function toMarkdown(html) {
const { elements, placeholders } = putPlaceholders(html); const { elements, placeholders } = putPlaceholders(html);
let markdown = Element.parse(elements).trim(); let markdown = Element.parse(elements).trim();
markdown = markdown.replace(/^<b>/, "").replace(/<\/b>$/, "").trim(); // fix for google doc copy paste markdown = markdown.replace(/^<b>/, "").replace(/<\/b>$/, "").trim(); // fix for google doc copy paste
markdown = markdown.replace(/\r/g, "").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n"); markdown = markdown.replace(/ +\n/g, "\n").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n");
return replacePlaceholders(markdown, placeholders); return replacePlaceholders(markdown, placeholders);
} catch(err) { } catch(err) {
return ""; return "";

View File

@ -4,19 +4,21 @@ QUnit.module("lib:to-markdown");
QUnit.test("converts styles between normal words", assert => { QUnit.test("converts styles between normal words", assert => {
const html = `Line with <s>styles</s> <b><i>between</i></b> words.`; const html = `Line with <s>styles</s> <b><i>between</i></b> words.`;
const markdown = `Line with ~~styles~~ **_between_** words.`; const markdown = `Line with ~~styles~~ ***between*** words.`;
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
assert.equal(toMarkdown("A <b>bold </b>word"), "A **bold** word");
}); });
QUnit.test("converts inline nested styles", assert => { QUnit.test("converts inline nested styles", assert => {
let html = `<em>Italicised line with <strong>some random</strong> <b>bold</b> words.</em>`; let html = `<em>Italicised line with <strong>some random</strong> <b>bold</b> words.</em>`;
let markdown = `_Italicised line with **some random** **bold** words._`; let markdown = `*Italicised line with **some random** **bold** words.*`;
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
html = `<i class="fa">Italicised line html = `<i class="fa">Italicised line
with <b title="strong">some with <b title="strong">some<br>
random</b> <s>bold</s> words.</i>`; random</b> <s>bold</s> words.</i>`;
markdown = `<i>Italicised line\n with <b>some\n random</b> ~~bold~~ words.</i>`; markdown = `<i>Italicised line with <b>some\nrandom</b> ~~bold~~ words.</i>`;
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
}); });
@ -26,7 +28,7 @@ QUnit.test("converts a link", assert => {
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
html = `<a href="https://discourse.org">Disc\n\n\nour\n\nse</a>`; html = `<a href="https://discourse.org">Disc\n\n\nour\n\nse</a>`;
markdown = `[Disc\nour\nse](https://discourse.org)`; markdown = `[Disc our se](https://discourse.org)`;
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
}); });
@ -82,7 +84,7 @@ QUnit.test("converts ul list tag", assert => {
<li>Item 3</li> <li>Item 3</li>
</ul> </ul>
`; `;
const markdown = `* Item 1\n* Item 2\n\n * Sub Item 1\n * Sub Item 2\n\n * Sub _Sub_ Item 1\n * Sub **Sub** Item 2\n\n* Item 3`; const markdown = `* Item 1\n* Item 2\n\n * Sub Item 1\n * Sub Item 2\n\n * Sub *Sub* Item 1\n * Sub **Sub** Item 2\n\n* Item 3`;
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
}); });
@ -101,10 +103,12 @@ QUnit.test("converts table tags", assert => {
<thead> <tr><th>Heading 1</th><th>Head 2</th></tr> </thead> <thead> <tr><th>Heading 1</th><th>Head 2</th></tr> </thead>
<tbody> <tbody>
<tr><td>Lorem</td><td>ipsum</td></tr> <tr><td>Lorem</td><td>ipsum</td></tr>
<tr><td><b>dolor</b></td> <td><i>sit amet</i></td></tr></tbody> <tr><td><b>dolor</b></td> <td><i>sit amet</i></td> </tr>
</tbody>
</table> </table>
`; `;
const markdown = `Discourse Avenue\n\n**laboris**\n\n|Heading 1|Head 2|\n| --- | --- |\n|Lorem|ipsum|\n|**dolor**|_sit amet_|`; const markdown = `Discourse Avenue\n\n**laboris**\n\n|Heading 1|Head 2|\n| --- | --- |\n|Lorem|ipsum|\n|**dolor**|*sit amet*|`;
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
}); });
@ -164,11 +168,11 @@ QUnit.test("supporting html tags by keeping them", assert => {
output = `[Lorem <del>ipsum dolor</del> sit](http://example.com).`; output = `[Lorem <del>ipsum dolor</del> sit](http://example.com).`;
assert.equal(toMarkdown(html), output); assert.equal(toMarkdown(html), output);
html = `Lorem <del>ipsum \n\n dolor</del> sit.`; html = `Lorem <del>ipsum dolor</del> sit.`;
assert.equal(toMarkdown(html), html); assert.equal(toMarkdown(html), html);
html = `Lorem <a href="http://example.com"><del>ipsum \n\n\n dolor</del> sit.</a>`; html = `Lorem <a href="http://example.com"><del>ipsum \n\n\n dolor</del> sit.</a>`;
output = `Lorem [<del>ipsum \n dolor</del> sit.](http://example.com)`; output = `Lorem [<del>ipsum dolor</del> sit.](http://example.com)`;
assert.equal(toMarkdown(html), output); assert.equal(toMarkdown(html), output);
}); });
@ -223,6 +227,6 @@ QUnit.test("converts ol list tag", assert => {
<li>Item 3</li> <li>Item 3</li>
</ol> </ol>
`; `;
const markdown = `Testing\n\n1. Item 1\n2. Item 2\n\n 100. Sub Item 1\n 101. Sub Item 2\n\n * Sub _Sub_ Item 1\n * Sub **Sub** Item 2\n\n3. Item 3`; const markdown = `Testing\n\n1. Item 1\n2. Item 2\n\n 100. Sub Item 1\n 101. Sub Item 2\n\n * Sub *Sub* Item 1\n * Sub **Sub** Item 2\n\n3. Item 3`;
assert.equal(toMarkdown(html), markdown); assert.equal(toMarkdown(html), markdown);
}); });