Remove unwanted spaces between HTML tags and support Word documents

This commit is contained in:
Vinoth Kannan 2017-12-22 09:28:24 +05:30
parent 3bc53f2946
commit 4935ae4338
3 changed files with 51 additions and 26 deletions

View File

@ -662,8 +662,6 @@ export default Ember.Component.extend({
if (table) {
this.appEvents.trigger('composer:insert-text', table);
handled = true;
} else if (html && html.includes("urn:schemas-microsoft-com:office:word")) {
html = ""; // use plain text data for microsoft word
}
}

View File

@ -38,15 +38,15 @@ class Tag {
}
static emphases() {
return [ ["b", "**"], ["strong", "**"], ["i", "_"], ["em", "_"], ["s", "~~"], ["strike", "~~"] ];
return [ ["b", "**"], ["strong", "**"], ["i", "*"], ["em", "*"], ["s", "~~"], ["strike", "~~"] ];
}
static slices() {
return ["dt", "dd", "tr", "thead", "tbody", "tfoot"];
return ["dt", "dd", "thead", "tbody", "tfoot"];
}
static trimmable() {
return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr", "blockquote", "table", "ol"];
return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr", "blockquote", "table", "ol", "tr"];
}
static block(name, prefix, suffix) {
@ -73,14 +73,17 @@ class Tag {
}
decorate(text) {
text = text.trim();
if (text.includes("\n")) {
this.prefix = `<${this.name}>`;
this.suffix = `</${this.name}>`;
}
return super.decorate(text);
let space = text.match(/^\s/) || [""];
this.prefix = space[0] + this.prefix;
space = text.match(/\s$/) || [""];
this.suffix = this.suffix + space[0];
return super.decorate(text.trim());
}
};
}
@ -182,10 +185,6 @@ class Tag {
throw "Unsupported format inside Markdown table cells";
}
if (!this.element.next) {
this.suffix = "|";
}
return this.decorate(text);
}
};
@ -268,6 +267,17 @@ class Tag {
};
}
static tr() {
return class extends Tag.slice("tr", "|\n") {
decorate(text) {
if (!this.element.next) {
this.suffix = "|";
}
return `${text}${this.suffix}`;
}
};
}
}
const tags = [
@ -278,7 +288,7 @@ const tags = [
Tag.cell("td"), Tag.cell("th"),
Tag.replace("br", "\n"), Tag.replace("hr", "\n---\n"), Tag.replace("head", ""),
Tag.keep("ins"), Tag.keep("del"), Tag.keep("small"), Tag.keep("big"),
Tag.li(), Tag.link(), Tag.image(), Tag.code(), Tag.blockquote(), Tag.table(),, Tag.ol(),
Tag.li(), Tag.link(), Tag.image(), Tag.code(), Tag.blockquote(), Tag.table(), Tag.ol(), Tag.tr(),
];
class Element {
@ -375,6 +385,19 @@ class Element {
}
}
// Normalises whitespace in pasted HTML before it is parsed.
//
// - If the markup contains a <body> element, only its inner HTML is kept.
// - Carriage returns, newlines and `&nbsp;` entities each become one space.
// - A run of two or more whitespace characters sitting directly between two
//   tags is collapsed to a single space.
//
// Takes the HTML as a string and returns the normalised HTML string.
function trimUnwantedSpaces(html) {
  // Tag names are case-insensitive in HTML, so accept <BODY> as well.
  const body = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  const content = body ? body[1] : html;

  return content
    .replace(/\r|\n|&nbsp;/g, " ")
    // One global pass. The lookahead leaves the following tag unconsumed so
    // it can open the next match (e.g. `</li>  <li>..</li>  </ul>`).
    // Using a regex with a fixed "$1 " replacement also keeps tag content
    // such as `$&` or `$$` from being interpreted as a replacement pattern,
    // which a string-for-string replace of the matched text would do.
    .replace(/(<[^\s>][^>]*>)\s{2,}(?=<[^\s>][^>]*>)/g, "$1 ");
}
function putPlaceholders(html) {
const codeRegEx = /<code[^>]*>([\s\S]*?)<\/code>/gi;
const origHtml = html;
@ -390,7 +413,7 @@ function putPlaceholders(html) {
match = codeRegEx.exec(origHtml);
}
const elements = parseHTML(html);
const elements = parseHTML(trimUnwantedSpaces(html));
return { elements, placeholders };
}
@ -406,7 +429,7 @@ export default function toMarkdown(html) {
const { elements, placeholders } = putPlaceholders(html);
let markdown = Element.parse(elements).trim();
markdown = markdown.replace(/^<b>/, "").replace(/<\/b>$/, "").trim(); // fix for google doc copy paste
markdown = markdown.replace(/\r/g, "").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n");
markdown = markdown.replace(/ +\n/g, "\n").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n");
return replacePlaceholders(markdown, placeholders);
} catch(err) {
return "";

View File

@ -4,19 +4,21 @@ QUnit.module("lib:to-markdown");
QUnit.test("converts styles between normal words", assert => {
const html = `Line with <s>styles</s> <b><i>between</i></b> words.`;
const markdown = `Line with ~~styles~~ **_between_** words.`;
const markdown = `Line with ~~styles~~ ***between*** words.`;
assert.equal(toMarkdown(html), markdown);
assert.equal(toMarkdown("A <b>bold </b>word"), "A **bold** word");
});
QUnit.test("converts inline nested styles", assert => {
let html = `<em>Italicised line with <strong>some random</strong> <b>bold</b> words.</em>`;
let markdown = `_Italicised line with **some random** **bold** words._`;
let markdown = `*Italicised line with **some random** **bold** words.*`;
assert.equal(toMarkdown(html), markdown);
html = `<i class="fa">Italicised line
with <b title="strong">some
with <b title="strong">some<br>
random</b> <s>bold</s> words.</i>`;
markdown = `<i>Italicised line\n with <b>some\n random</b> ~~bold~~ words.</i>`;
markdown = `<i>Italicised line with <b>some\nrandom</b> ~~bold~~ words.</i>`;
assert.equal(toMarkdown(html), markdown);
});
@ -26,7 +28,7 @@ QUnit.test("converts a link", assert => {
assert.equal(toMarkdown(html), markdown);
html = `<a href="https://discourse.org">Disc\n\n\nour\n\nse</a>`;
markdown = `[Disc\nour\nse](https://discourse.org)`;
markdown = `[Disc our se](https://discourse.org)`;
assert.equal(toMarkdown(html), markdown);
});
@ -82,7 +84,7 @@ QUnit.test("converts ul list tag", assert => {
<li>Item 3</li>
</ul>
`;
const markdown = `* Item 1\n* Item 2\n\n * Sub Item 1\n * Sub Item 2\n\n * Sub _Sub_ Item 1\n * Sub **Sub** Item 2\n\n* Item 3`;
const markdown = `* Item 1\n* Item 2\n\n * Sub Item 1\n * Sub Item 2\n\n * Sub *Sub* Item 1\n * Sub **Sub** Item 2\n\n* Item 3`;
assert.equal(toMarkdown(html), markdown);
});
@ -101,10 +103,12 @@ QUnit.test("converts table tags", assert => {
<thead> <tr><th>Heading 1</th><th>Head 2</th></tr> </thead>
<tbody>
<tr><td>Lorem</td><td>ipsum</td></tr>
<tr><td><b>dolor</b></td> <td><i>sit amet</i></td></tr></tbody>
<tr><td><b>dolor</b></td> <td><i>sit amet</i></td> </tr>
</tbody>
</table>
`;
const markdown = `Discourse Avenue\n\n**laboris**\n\n|Heading 1|Head 2|\n| --- | --- |\n|Lorem|ipsum|\n|**dolor**|_sit amet_|`;
const markdown = `Discourse Avenue\n\n**laboris**\n\n|Heading 1|Head 2|\n| --- | --- |\n|Lorem|ipsum|\n|**dolor**|*sit amet*|`;
assert.equal(toMarkdown(html), markdown);
});
@ -164,11 +168,11 @@ QUnit.test("supporting html tags by keeping them", assert => {
output = `[Lorem <del>ipsum dolor</del> sit](http://example.com).`;
assert.equal(toMarkdown(html), output);
html = `Lorem <del>ipsum \n\n dolor</del> sit.`;
html = `Lorem <del>ipsum dolor</del> sit.`;
assert.equal(toMarkdown(html), html);
html = `Lorem <a href="http://example.com"><del>ipsum \n\n\n dolor</del> sit.</a>`;
output = `Lorem [<del>ipsum \n dolor</del> sit.](http://example.com)`;
output = `Lorem [<del>ipsum dolor</del> sit.](http://example.com)`;
assert.equal(toMarkdown(html), output);
});
@ -223,6 +227,6 @@ QUnit.test("converts ol list tag", assert => {
<li>Item 3</li>
</ol>
`;
const markdown = `Testing\n\n1. Item 1\n2. Item 2\n\n 100. Sub Item 1\n 101. Sub Item 2\n\n * Sub _Sub_ Item 1\n * Sub **Sub** Item 2\n\n3. Item 3`;
const markdown = `Testing\n\n1. Item 1\n2. Item 2\n\n 100. Sub Item 1\n 101. Sub Item 2\n\n * Sub *Sub* Item 1\n * Sub **Sub** Item 2\n\n3. Item 3`;
assert.equal(toMarkdown(html), markdown);
});