FIX: use URI.regexp to find URLs in plain text

parent 1f73a3ba6d
commit b1c5ea4289
@@ -3,8 +3,6 @@
 class PlainTextToMarkdown
   SIGNATURE_SEPARATOR ||= "-- ".freeze
 
-  URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i
-
   def initialize(plaintext, opts = {})
     @plaintext = plaintext
     @lines = []
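The removed pattern is a textbook backtracking hazard: (?:[^\s()<>]+|\([^\s()<>]+\))+ nests an unbounded quantifier inside another, so a tail that almost matches can force the engine through an exponential number of partitions before it gives up ground. A standalone sketch of the failure mode, not part of the commit; the probe string is borrowed from the spec added below:

require "timeout"

OLD_URL_REGEX = /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i

begin
  # the trailing dots can never satisfy the final character class, so the
  # engine keeps re-partitioning the run of dots instead of failing fast
  Timeout.timeout(0.25) { "https://www.discourse.org/?boom=#{"." * 20}" =~ OLD_URL_REGEX }
  puts "matched in time"
rescue Timeout::Error
  puts "backtracking blew the 0.25s budget"
end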
@@ -150,15 +148,17 @@ class PlainTextToMarkdown
     converted_text
   end
 
-  def replace_duplicate_links(text)
-    text.to_enum(:scan, URL_REGEX)
-      .map { $& }
-      .group_by { |url| url }
-      .keep_if { |_, urls| urls.length > 1 }
-      .keys.each do |url|
-        escaped = Regexp.escape(url)
-        text.gsub!(Regexp.new(%Q|#{escaped}(\s*[()\\[\\]<>«»'"“”‘’]?#{escaped}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url)
-      end
+  URL_REGEX ||= URI.regexp(%w{http https ftp mailto})
+  BEFORE ||= Regexp.escape(%Q|([<«"“'‘|)
+  AFTER ||= Regexp.escape(%Q|)]>»"”'’|)
+
+  def replace_duplicate_links(text)
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
+
+    urls.each do |url|
+      escaped = Regexp.escape(url)
+      text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
+    end
 
     text
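For reference, a standalone sketch of what the new constants do together, with invented example text. URI.regexp is the stdlib URI matcher (deprecated in later Ruby versions but current at the time), so only well-formed URIs of the listed schemes are recognized; replace_duplicate_links then collapses the "url <url>" duplicates that mail clients emit:

require "uri"
require "set"

URL_REGEX = URI.regexp(%w{http https ftp mailto})
BEFORE = Regexp.escape(%Q|([<«"“'‘|)
AFTER = Regexp.escape(%Q|)]>»"”'’|)

text = +"see https://example.com <https://example.com> for details"

urls = Set.new
text.scan(URL_REGEX) { urls << $& }  # $& is the full match; the capture groups are irrelevant here

urls.each do |url|
  escaped = Regexp.escape(url)
  # note: inside %Q, \s is a plain space, so the pattern reads "zero or more spaces"
  text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
end

text  # => "see https://example.com for details"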
@@ -175,19 +175,20 @@ class PlainTextToMarkdown
   end
 
   def escape_special_characters(text)
-    escaped_text = +""
+    urls = Set.new
+    text.scan(URL_REGEX) { urls << $& }
 
-    text.split(URL_REGEX).each do |text_part|
-      if text_part =~ URL_REGEX
-        # no escaping within URLs
-        escaped_text << text_part
-      else
-        # escape Markdown and HTML
-        text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
-        escaped_text << CGI.escapeHTML(text_part)
-      end
-    end
+    hoisted = urls
+      .map { |url| [SecureRandom.hex, url] }
+      .to_h
 
-    escaped_text
+    hoisted.each { |h, url| text.gsub!(url, h) }
+
+    text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
+    text = CGI.escapeHTML(text)
+
+    hoisted.each { |h, url| text.gsub!(h, url) }
+
+    text
   end
 end
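The rewritten escape_special_characters no longer splits the text on the URL regex; it hoists every URL out behind a random hex token, escapes the remainder, and swaps the URLs back in, so URLs pass through untouched. A standalone sketch of that round trip, with a simplified scheme list and invented input:

require "uri"
require "set"
require "securerandom"
require "cgi"

URL_REGEX = URI.regexp(%w{http https})

def escape_special_characters(text)
  urls = Set.new
  text.scan(URL_REGEX) { urls << $& }

  # hide each URL behind a random hex token so escaping cannot touch it
  hoisted = urls.map { |url| [SecureRandom.hex, url] }.to_h
  hoisted.each { |h, url| text.gsub!(url, h) }

  # escape Markdown punctuation, then HTML
  text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
  text = CGI.escapeHTML(text)

  # swap the URLs back in, untouched
  hoisted.each { |h, url| text.gsub!(h, url) }
  text
end

puts escape_special_characters(+"*important* https://example.com/a_b & more")
# prints: \*important\* https://example.com/a_b &amp; more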
@@ -177,6 +177,14 @@ describe PlainTextToMarkdown do
     expect(to_markdown("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz"))
       .to eq("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz")
   end
 
+  it "does not explode with weird links" do
+    expect {
+      Timeout::timeout(0.25) {
+        to_markdown("https://www.discourse.org/?boom=#{"." * 20}")
+      }
+    }.not_to raise_error
+  end
+
   context "code" do