Remove unwanted spaces between HTML tags and support Word documents

2017-12-22 09:28:24 +05:30 · 2017-12-22 09:28:24 +05:30 · 4935ae4338
parent 3bc53f2946
commit 4935ae4338
3 changed files with 51 additions and 26 deletions
--- a/app/assets/javascripts/discourse/components/d-editor.js.es6
+++ b/app/assets/javascripts/discourse/components/d-editor.js.es6
@ -662,8 +662,6 @@ export default Ember.Component.extend({
      if (table) {
        this.appEvents.trigger('composer:insert-text', table);
        handled = true;
      } else if (html && html.includes("urn:schemas-microsoft-com:office:word")) {
        html = ""; // use plain text data for microsoft word
      }
    }
--- a/app/assets/javascripts/discourse/lib/to-markdown.js.es6
+++ b/app/assets/javascripts/discourse/lib/to-markdown.js.es6
@ -38,15 +38,15 @@ class Tag {
  }
  static emphases() {
-    return  [ ["b", "**"], ["strong", "**"], ["i", "_"], ["em", "_"], ["s", "~~"], ["strike", "~~"] ];
+    return  [ ["b", "**"], ["strong", "**"], ["i", "*"], ["em", "*"], ["s", "~~"], ["strike", "~~"] ];
  }
  static slices() {
-    return ["dt", "dd", "tr", "thead", "tbody", "tfoot"];
+    return ["dt", "dd", "thead", "tbody", "tfoot"];
  }
  static trimmable() {
-    return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr", "blockquote", "table", "ol"];
+    return [...Tag.blocks(), ...Tag.headings(), ...Tag.slices(), "li", "td", "th", "br", "hr", "blockquote", "table", "ol", "tr"];
  }
  static block(name, prefix, suffix) {
@ -73,14 +73,17 @@ class Tag {
      }
      decorate(text) {
        text = text.trim();
        if (text.includes("\n")) {
          this.prefix = `<${this.name}>`;
          this.suffix = `</${this.name}>`;
        }
-        return super.decorate(text);
+        let space = text.match(/^\s/) || [""];
        this.prefix = space[0] + this.prefix;
        space = text.match(/\s$/) || [""];
        this.suffix = this.suffix + space[0];
        return super.decorate(text.trim());
      }
    };
  }
@ -182,10 +185,6 @@ class Tag {
          throw "Unsupported format inside Markdown table cells";
        }
        if (!this.element.next) {
          this.suffix = "|";
        }
        return this.decorate(text);
      }
    };
@ -268,6 +267,17 @@ class Tag {
    };
  }
  static tr() {
    return class extends Tag.slice("tr", "|\n") {
      decorate(text) {
        if (!this.element.next) {
          this.suffix = "|";
        }
        return `${text}${this.suffix}`;
      }
    };
  }
 }
 const tags = [
@ -278,7 +288,7 @@ const tags = [
  Tag.cell("td"), Tag.cell("th"),
  Tag.replace("br", "\n"), Tag.replace("hr", "\n---\n"), Tag.replace("head", ""),
  Tag.keep("ins"), Tag.keep("del"), Tag.keep("small"), Tag.keep("big"),
-  Tag.li(), Tag.link(), Tag.image(), Tag.code(), Tag.blockquote(), Tag.table(),, Tag.ol(),
+  Tag.li(), Tag.link(), Tag.image(), Tag.code(), Tag.blockquote(), Tag.table(), Tag.ol(), Tag.tr(),
 ];
 class Element {
@ -375,6 +385,19 @@ class Element {
  }
 }
 /**
  * Normalises whitespace in pasted HTML before it is parsed.
  *
  * Steps:
  *   1. If the markup contains a `<body>…</body>` pair, keep only its contents.
  *   2. Replace every `\r`, `\n` and `&nbsp;` with a single space.
  *   3. Collapse any run of two or more whitespace characters that sits
  *      directly between two tags down to one space.
  *
  * Whitespace that is not enclosed by a tag on both sides is left alone.
  *
  * @param {string} html - raw HTML markup.
  * @returns {string} the markup with inter-tag whitespace collapsed.
  */
 function trimUnwantedSpaces(html) {
  const body = html.match(/<body[^>]*>([\s\S]*?)<\/body>/);
  const content = (body ? body[1] : html).replace(/\r|\n|&nbsp;/g, " ");

  // A single global pass replaces the earlier "match, then replace the matched
  // string" loop. That loop passed markup as the *replacement string*, so `$`
  // sequences inside attributes were interpreted as replacement patterns:
  // "$$" was rewritten to "$", and "$&" re-inserted the uncollapsed match on
  // every iteration so the loop never finished. Text substituted through a
  // capture group is inserted verbatim.
  // The lookahead leaves the following tag unconsumed so it can also act as
  // the opening tag of the next gap (`<a>  <b>  <c>`).
  return content.replace(/(<[^\s>]+[^>]*>)\s{2,}(?=<[^\s>]+[^>]*>)/g, "$1 ");
 }
 function putPlaceholders(html) {
  const codeRegEx = /<code[^>]*>([\s\S]*?)<\/code>/gi;
  const origHtml = html;
@ -390,7 +413,7 @@ function putPlaceholders(html) {
    match = codeRegEx.exec(origHtml);
  }
-  const elements = parseHTML(html);
+  const elements = parseHTML(trimUnwantedSpaces(html));
  return { elements, placeholders };
 }
@ -406,7 +429,7 @@ export default function toMarkdown(html) {
    const { elements, placeholders } = putPlaceholders(html);
    let markdown = Element.parse(elements).trim();
    markdown = markdown.replace(/^<b>/, "").replace(/<\/b>$/, "").trim(); // fix for google doc copy paste
-    markdown = markdown.replace(/\r/g, "").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n");
+    markdown = markdown.replace(/ +\n/g, "\n").replace(/\n \n/g, "\n\n").replace(/\n{3,}/g, "\n\n");
    return replacePlaceholders(markdown, placeholders);
  } catch(err) {
    return "";
--- a/test/javascripts/lib/to-markdown-test.js.es6
+++ b/test/javascripts/lib/to-markdown-test.js.es6
@ -4,19 +4,21 @@ QUnit.module("lib:to-markdown");
 QUnit.test("converts styles between normal words", assert => {
  const html = `Line with <s>styles</s> <b><i>between</i></b> words.`;
-  const markdown = `Line with ~~styles~~ **_between_** words.`;
+  const markdown = `Line with ~~styles~~ ***between*** words.`;
  assert.equal(toMarkdown(html), markdown);
  assert.equal(toMarkdown("A <b>bold </b>word"), "A **bold** word");
 });
 QUnit.test("converts inline nested styles", assert => {
  let html = `<em>Italicised line with <strong>some random</strong> <b>bold</b> words.</em>`;
-  let markdown = `_Italicised line with **some random** **bold** words._`;
+  let markdown = `*Italicised line with **some random** **bold** words.*`;
  assert.equal(toMarkdown(html), markdown);
  html = `<i class="fa">Italicised line
-   with <b title="strong">some
+   with <b title="strong">some<br>
   random</b> <s>bold</s> words.</i>`;
-  markdown = `<i>Italicised line\n with <b>some\n random</b> ~~bold~~ words.</i>`;
+  markdown = `<i>Italicised line with <b>some\nrandom</b> ~~bold~~ words.</i>`;
  assert.equal(toMarkdown(html), markdown);
 });
@ -26,7 +28,7 @@ QUnit.test("converts a link", assert => {
  assert.equal(toMarkdown(html), markdown);
  html = `<a href="https://discourse.org">Disc\n\n\nour\n\nse</a>`;
-  markdown = `[Disc\nour\nse](https://discourse.org)`;
+  markdown = `[Disc our se](https://discourse.org)`;
  assert.equal(toMarkdown(html), markdown);
 });
@ -82,7 +84,7 @@ QUnit.test("converts ul list tag", assert => {
    <li>Item 3</li>
  </ul>
  `;
-  const markdown = `* Item 1\n* Item 2\n\n  * Sub Item 1\n  * Sub Item 2\n\n    * Sub _Sub_ Item 1\n    * Sub **Sub** Item 2\n\n* Item 3`;
+  const markdown = `* Item 1\n* Item 2\n\n  * Sub Item 1\n  * Sub Item 2\n\n    * Sub *Sub* Item 1\n    * Sub **Sub** Item 2\n\n* Item 3`;
  assert.equal(toMarkdown(html), markdown);
 });
@ -101,10 +103,12 @@ QUnit.test("converts table tags", assert => {
    <thead> <tr><th>Heading 1</th><th>Head 2</th></tr> </thead>
      <tbody>
        <tr><td>Lorem</td><td>ipsum</td></tr>
-        <tr><td><b>dolor</b></td> <td><i>sit amet</i></td></tr></tbody>
+        <tr><td><b>dolor</b></td> <td><i>sit amet</i></td> </tr>
        </tbody>
 </table>
  `;
-  const markdown = `Discourse Avenue\n\n**laboris**\n\n|Heading 1|Head 2|\n| --- | --- |\n|Lorem|ipsum|\n|**dolor**|_sit amet_|`;
+  const markdown = `Discourse Avenue\n\n**laboris**\n\n|Heading 1|Head 2|\n| --- | --- |\n|Lorem|ipsum|\n|**dolor**|*sit amet*|`;
  assert.equal(toMarkdown(html), markdown);
 });
@ -164,11 +168,11 @@ QUnit.test("supporting html tags by keeping them", assert => {
  output = `[Lorem <del>ipsum dolor</del> sit](http://example.com).`;
  assert.equal(toMarkdown(html), output);
-  html = `Lorem <del>ipsum \n\n dolor</del> sit.`;
+  html = `Lorem <del>ipsum dolor</del> sit.`;
  assert.equal(toMarkdown(html), html);
  html = `Lorem <a href="http://example.com"><del>ipsum \n\n\n dolor</del> sit.</a>`;
-  output = `Lorem [<del>ipsum \n dolor</del> sit.](http://example.com)`;
+  output = `Lorem [<del>ipsum dolor</del> sit.](http://example.com)`;
  assert.equal(toMarkdown(html), output);
 });
@ -223,6 +227,6 @@ QUnit.test("converts ol list tag", assert => {
    <li>Item 3</li>
  </ol>
  `;
-  const markdown = `Testing\n\n1. Item 1\n2. Item 2\n\n  100. Sub Item 1\n  101. Sub Item 2\n\n    * Sub _Sub_ Item 1\n    * Sub **Sub** Item 2\n\n3. Item 3`;
+  const markdown = `Testing\n\n1. Item 1\n2. Item 2\n\n  100. Sub Item 1\n  101. Sub Item 2\n\n    * Sub *Sub* Item 1\n    * Sub **Sub** Item 2\n\n3. Item 3`;
  assert.equal(toMarkdown(html), markdown);
 });