From b1c5ea42896193a6940cbf66f614596fa546d906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Fri, 7 Jun 2019 01:26:06 +0200 Subject: [PATCH] FIX: use URI.regexp to find URLs in plain text --- lib/plain_text_to_markdown.rb | 43 ++++++++++--------- .../components/plain_text_to_markdown_spec.rb | 8 ++++ 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/lib/plain_text_to_markdown.rb b/lib/plain_text_to_markdown.rb index 33aa3c3d7d6..f365731130b 100644 --- a/lib/plain_text_to_markdown.rb +++ b/lib/plain_text_to_markdown.rb @@ -3,8 +3,6 @@ class PlainTextToMarkdown SIGNATURE_SEPARATOR ||= "-- ".freeze - URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i - def initialize(plaintext, opts = {}) @plaintext = plaintext @lines = [] @@ -150,15 +148,17 @@ class PlainTextToMarkdown converted_text end - def replace_duplicate_links(text) - text.to_enum(:scan, URL_REGEX) - .map { $& } - .group_by { |url| url } - .keep_if { |_, urls | urls.length > 1 } - .keys.each do |url| + URL_REGEX ||= URI.regexp(%w{http https ftp mailto}) + BEFORE ||= Regexp.escape(%Q|([<«"“'‘|) + AFTER ||= Regexp.escape(%Q|)]>»"”'’|) + def replace_duplicate_links(text) + urls = Set.new + text.scan(URL_REGEX) { urls << $& } + + urls.each do |url| escaped = Regexp.escape(url) - text.gsub!(Regexp.new(%Q|#{escaped}(\s*[()\\[\\]<>«»'"“”‘’]?#{escaped}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url) + text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url) end text @@ -175,19 +175,20 @@ class PlainTextToMarkdown end def escape_special_characters(text) - escaped_text = +"" + urls = Set.new + text.scan(URL_REGEX) { urls << $& } - text.split(URL_REGEX).each do |text_part| - if text_part =~ URL_REGEX - # no escaping withing URLs - escaped_text << text_part - else - # escape Markdown and HTML - text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" } - escaped_text << CGI.escapeHTML(text_part) - end - end + hoisted = urls + .map { |url| [SecureRandom.hex, url] } + .to_h - escaped_text + hoisted.each { |h, url| text.gsub!(url, h) } + + text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" } + text = CGI.escapeHTML(text) + + hoisted.each { |h, url| text.gsub!(h, url) } + + text end end diff --git a/spec/components/plain_text_to_markdown_spec.rb b/spec/components/plain_text_to_markdown_spec.rb index 724b3c14f7a..6cdbff949bb 100644 --- a/spec/components/plain_text_to_markdown_spec.rb +++ b/spec/components/plain_text_to_markdown_spec.rb @@ -177,6 +177,14 @@ describe PlainTextToMarkdown do expect(to_markdown("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz")) .to eq("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz") end + + it "does not explode with weird links" do + expect { + Timeout::timeout(0.25) { + to_markdown("https://www.discourse.org/?boom=#{"." * 20}") + } + }.not_to raise_error + end end context "code" do