From b1c5ea42896193a6940cbf66f614596fa546d906 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Hanol?= <regis@hanol.fr>
Date: Fri, 7 Jun 2019 01:26:06 +0200
Subject: [PATCH] FIX: use URI.regexp to find URLs in plain text

---
 lib/plain_text_to_markdown.rb                 | 43 ++++++++++---------
 .../components/plain_text_to_markdown_spec.rb |  8 ++++
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/lib/plain_text_to_markdown.rb b/lib/plain_text_to_markdown.rb
index 33aa3c3d7d6..f365731130b 100644
--- a/lib/plain_text_to_markdown.rb
+++ b/lib/plain_text_to_markdown.rb
@@ -3,8 +3,6 @@
 class PlainTextToMarkdown
   SIGNATURE_SEPARATOR ||= "-- ".freeze
 
-  URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i
-
   def initialize(plaintext, opts = {})
     @plaintext = plaintext
     @lines = []
@@ -150,15 +148,17 @@ class PlainTextToMarkdown
     converted_text
   end
 
-  def replace_duplicate_links(text)
-    text.to_enum(:scan, URL_REGEX)
-      .map { $& }
-      .group_by { |url| url }
-      .keep_if { |_, urls | urls.length > 1 }
-      .keys.each do |url|
+  URL_REGEX ||= URI.regexp(%w{http https ftp mailto})
+  BEFORE ||= Regexp.escape(%Q|([<«"“'‘|)
+  AFTER  ||= Regexp.escape(%Q|)]>»"”'’|)
 
+  def replace_duplicate_links(text)
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
+
+    urls.each do |url|
       escaped = Regexp.escape(url)
-      text.gsub!(Regexp.new(%Q|#{escaped}(\s*[()\\[\\]<>«»'"“”‘’]?#{escaped}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url)
+      text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
     end
 
     text
@@ -175,19 +175,20 @@ class PlainTextToMarkdown
   end
 
   def escape_special_characters(text)
-    escaped_text = +""
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
 
-    text.split(URL_REGEX).each do |text_part|
-      if text_part =~ URL_REGEX
-        # no escaping withing URLs
-        escaped_text << text_part
-      else
-        # escape Markdown and HTML
-        text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
-        escaped_text << CGI.escapeHTML(text_part)
-      end
-    end
+    hoisted = urls
+      .map { |url| [SecureRandom.hex, url] }
+      .to_h
 
-    escaped_text
+    hoisted.each { |h, url| text.gsub!(url, h) }
+
+    text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
+    text = CGI.escapeHTML(text)
+
+    hoisted.each { |h, url| text.gsub!(h, url) }
+
+    text
   end
 end
diff --git a/spec/components/plain_text_to_markdown_spec.rb b/spec/components/plain_text_to_markdown_spec.rb
index 724b3c14f7a..6cdbff949bb 100644
--- a/spec/components/plain_text_to_markdown_spec.rb
+++ b/spec/components/plain_text_to_markdown_spec.rb
@@ -177,6 +177,14 @@ describe PlainTextToMarkdown do
       expect(to_markdown("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz"))
         .to eq("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz")
     end
+
+    it "does not explode with weird links" do
+      expect {
+        Timeout::timeout(0.25) {
+          to_markdown("https://www.discourse.org/?boom=#{"." * 20}")
+        }
+      }.not_to raise_error
+    end
   end
 
   context "code" do