FIX: use URI.regexp to find URLs in plain text

Régis Hanol 2019-06-07 01:26:06 +02:00
parent 1f73a3ba6d
commit b1c5ea4289
2 changed files with 30 additions and 21 deletions
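As a quick illustration (not part of the commit), URI.regexp builds the Ruby stdlib's URI pattern restricted to the given schemes. That pattern contains many capture groups, so String#scan with it would yield arrays of captures; the new code therefore uses scan's block form and collects $&, the full match. A minimal sketch, with a made-up input string:

require "uri"
require "set"

URL_REGEX = URI.regexp(%w{http https ftp mailto})

text = "visit https://www.discourse.org and write to mailto:team@example.com"

# URI.regexp has many capture groups, so the block form is used and
# $& (the whole match) is collected instead of the capture arrays.
urls = Set.new
text.scan(URL_REGEX) { urls << $& }

urls.to_a # e.g. ["https://www.discourse.org", "mailto:team@example.com"]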


@@ -3,8 +3,6 @@
 class PlainTextToMarkdown
   SIGNATURE_SEPARATOR ||= "-- ".freeze
-  URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i

   def initialize(plaintext, opts = {})
     @plaintext = plaintext
     @lines = []
@@ -150,15 +148,17 @@ class PlainTextToMarkdown
     converted_text
   end

+  URL_REGEX ||= URI.regexp(%w{http https ftp mailto})
+  BEFORE ||= Regexp.escape(%Q|([<«"“'|)
+  AFTER ||= Regexp.escape(%Q|)]>»"”'|)
+
   def replace_duplicate_links(text)
-    text.to_enum(:scan, URL_REGEX)
-      .map { $& }
-      .group_by { |url| url }
-      .keep_if { |_, urls| urls.length > 1 }
-      .keys.each do |url|
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
+
+    urls.each do |url|
       escaped = Regexp.escape(url)
-      text.gsub!(Regexp.new(%Q|#{escaped}(\s*[()\\[\\]<>«»'"“”‘’]?#{escaped}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url)
+      text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
     end

     text
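To see what the rewritten replace_duplicate_links collapses, here is a standalone sketch outside the class; the input is a hypothetical plain-text body in which a mail client repeated the link in angle brackets right after the bare URL:

require "uri"
require "set"

URL_REGEX = URI.regexp(%w{http https ftp mailto})
BEFORE = Regexp.escape(%Q|([<«"“'|)
AFTER = Regexp.escape(%Q|)]>»"”'|)

# The leading + makes the literal mutable so gsub! can modify it in place.
text = +"read https://example.com/doc <https://example.com/doc> first"

urls = Set.new
text.scan(URL_REGEX) { urls << $& }

urls.each do |url|
  escaped = Regexp.escape(url)
  text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
end

text # e.g. "read https://example.com/doc first"

Unlike the old version, which only processed URLs appearing more than once, every URL found in the text is now run through the same gsub!; a URL that occurs only once simply matches nothing and is left alone.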
@@ -175,19 +175,20 @@ class PlainTextToMarkdown
   end

   def escape_special_characters(text)
-    escaped_text = +""
-
-    text.split(URL_REGEX).each do |text_part|
-      if text_part =~ URL_REGEX
-        # no escaping withing URLs
-        escaped_text << text_part
-      else
-        # escape Markdown and HTML
-        text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
-        escaped_text << CGI.escapeHTML(text_part)
-      end
-    end
-
-    escaped_text
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
+
+    hoisted = urls
+      .map { |url| [SecureRandom.hex, url] }
+      .to_h
+
+    hoisted.each { |h, url| text.gsub!(url, h) }
+    text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
+    text = CGI.escapeHTML(text)
+    hoisted.each { |h, url| text.gsub!(h, url) }
+
+    text
   end
 end
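The new escape_special_characters takes a hoist-and-restore approach: every URL is swapped out for an opaque placeholder, the remaining text is escaped for Markdown and HTML, and the URLs are then swapped back in untouched. A self-contained sketch of that round trip, again with a hypothetical input:

require "uri"
require "set"
require "cgi"
require "securerandom"

URL_REGEX = URI.regexp(%w{http https ftp mailto})

# Hypothetical input mixing Markdown characters with a URL whose own
# characters (_ and .) must not be escaped.
text = +"*important* see https://example.com/a_b.html [soon]"

urls = Set.new
text.scan(URL_REGEX) { urls << $& }

# Hoist each URL behind a random hex token so the escaping passes below
# cannot touch anything inside the URL.
hoisted = urls.map { |url| [SecureRandom.hex, url] }.to_h
hoisted.each { |h, url| text.gsub!(url, h) }

text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
text = CGI.escapeHTML(text)

# Restore the untouched URLs in place of their tokens.
hoisted.each { |h, url| text.gsub!(h, url) }

text # Markdown characters around the URL are now backslash-escaped,
     # e.g. \*important\* and \[soon\], while the URL itself is unchanged

This replaces the old split/branch loop, which relied on URL_REGEX both to split the text and to re-identify the URL parts.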


@@ -177,6 +177,14 @@ describe PlainTextToMarkdown do
       expect(to_markdown("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz"))
         .to eq("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz")
     end
+
+    it "does not explode with weird links" do
+      expect {
+        Timeout::timeout(0.25) {
+          to_markdown("https://www.discourse.org/?boom=#{"." * 20}")
+        }
+      }.not_to raise_error
+    end
   end

   context "code" do