FIX: use URI.regexp to find URLs in plain text

Régis Hanol 2019-06-07 01:26:06 +02:00
parent 1f73a3ba6d
commit b1c5ea4289
2 changed files with 30 additions and 21 deletions
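As a quick illustration (not part of the commit), URI.regexp builds the Ruby stdlib's URI pattern restricted to the given schemes. That pattern contains many capture groups, so String#scan with it would yield arrays of captures; the new code therefore uses scan's block form and collects $&, the full match. A minimal sketch, with a made-up input string:

require "uri"
require "set"

URL_REGEX = URI.regexp(%w{http https ftp mailto})

text = "visit https://www.discourse.org and write to mailto:team@example.com"

# URI.regexp has many capture groups, so the block form is used and
# $& (the whole match) is collected instead of the capture arrays.
urls = Set.new
text.scan(URL_REGEX) { urls << $& }

urls.to_a # e.g. ["https://www.discourse.org", "mailto:team@example.com"]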


@@ -3,8 +3,6 @@
 class PlainTextToMarkdown
   SIGNATURE_SEPARATOR ||= "-- ".freeze
-  URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i

   def initialize(plaintext, opts = {})
     @plaintext = plaintext
     @lines = []
@@ -150,15 +148,17 @@ class PlainTextToMarkdown
     converted_text
   end

+  URL_REGEX ||= URI.regexp(%w{http https ftp mailto})
+  BEFORE ||= Regexp.escape(%Q|([<«"“'|)
+  AFTER ||= Regexp.escape(%Q|)]>»"”'|)
+
   def replace_duplicate_links(text)
-    text.to_enum(:scan, URL_REGEX)
-      .map { $& }
-      .group_by { |url| url }
-      .keep_if { |_, urls| urls.length > 1 }
-      .keys.each do |url|
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
+
+    urls.each do |url|
       escaped = Regexp.escape(url)
-      text.gsub!(Regexp.new(%Q|#{escaped}(\s*[()\\[\\]<>«»'"“”‘’]?#{escaped}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url)
+      text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
     end

     text
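To see what the rewritten replace_duplicate_links collapses, here is a standalone sketch outside the class; the input is a hypothetical plain-text body in which a mail client repeated the link in angle brackets right after the bare URL:

require "uri"
require "set"

URL_REGEX = URI.regexp(%w{http https ftp mailto})
BEFORE = Regexp.escape(%Q|([<«"“'|)
AFTER = Regexp.escape(%Q|)]>»"”'|)

# The leading + makes the literal mutable so gsub! can modify it in place.
text = +"read https://example.com/doc <https://example.com/doc> first"

urls = Set.new
text.scan(URL_REGEX) { urls << $& }

urls.each do |url|
  escaped = Regexp.escape(url)
  text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
end

text # e.g. "read https://example.com/doc first"

Unlike the old version, which only processed URLs appearing more than once, every URL found in the text is now run through the same gsub!; a URL that occurs only once simply matches nothing and is left alone.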
@@ -175,19 +175,20 @@ class PlainTextToMarkdown
   end

   def escape_special_characters(text)
-    escaped_text = +""
-
-    text.split(URL_REGEX).each do |text_part|
-      if text_part =~ URL_REGEX
-        # no escaping withing URLs
-        escaped_text << text_part
-      else
-        # escape Markdown and HTML
-        text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
-        escaped_text << CGI.escapeHTML(text_part)
-      end
-    end
-
-    escaped_text
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
+
+    hoisted = urls
+      .map { |url| [SecureRandom.hex, url] }
+      .to_h
+
+    hoisted.each { |h, url| text.gsub!(url, h) }
+    text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
+    text = CGI.escapeHTML(text)
+    hoisted.each { |h, url| text.gsub!(h, url) }
+
+    text
   end
 end
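The new escape_special_characters takes a hoist-and-restore approach: every URL is swapped out for an opaque placeholder, the remaining text is escaped for Markdown and HTML, and the URLs are then swapped back in untouched. A self-contained sketch of that round trip, again with a hypothetical input:

require "uri"
require "set"
require "cgi"
require "securerandom"

URL_REGEX = URI.regexp(%w{http https ftp mailto})

# Hypothetical input mixing Markdown characters with a URL whose own
# characters (_ and .) must not be escaped.
text = +"*important* see https://example.com/a_b.html [soon]"

urls = Set.new
text.scan(URL_REGEX) { urls << $& }

# Hoist each URL behind a random hex token so the escaping passes below
# cannot touch anything inside the URL.
hoisted = urls.map { |url| [SecureRandom.hex, url] }.to_h
hoisted.each { |h, url| text.gsub!(url, h) }

text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
text = CGI.escapeHTML(text)

# Restore the untouched URLs in place of their tokens.
hoisted.each { |h, url| text.gsub!(h, url) }

text # Markdown characters around the URL are now backslash-escaped,
     # e.g. \*important\* and \[soon\], while the URL itself is unchanged

This replaces the old split/branch loop, which relied on URL_REGEX both to split the text and to re-identify the URL parts.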


@@ -177,6 +177,14 @@ describe PlainTextToMarkdown do
       expect(to_markdown("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz"))
         .to eq("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz")
     end
+
+    it "does not explode with weird links" do
+      expect {
+        Timeout::timeout(0.25) {
+          to_markdown("https://www.discourse.org/?boom=#{"." * 20}")
+        }
+      }.not_to raise_error
+    end
   end

   context "code" do