DEV: Switch `InlineUploads` to a regexp based implementation.

This commit is contained in:
Guo Xiang Tan 2019-06-03 15:41:26 +08:00 committed by Guo Xiang Tan
parent d93e5fb00d
commit 1991af2abb
8 changed files with 369 additions and 134 deletions

View File

@ -151,7 +151,6 @@ group :development do
gem 'bullet', require: !!ENV['BULLET']
gem 'better_errors'
gem 'binding_of_caller'
gem 'diffy'
# waiting on 2.7.5 per: https://github.com/ctran/annotate_models/pull/595
if rails_master?

View File

@ -90,7 +90,6 @@ GEM
crass (1.0.4)
debug_inspector (0.0.3)
diff-lcs (1.3)
diffy (3.3.0)
discourse-ember-source (3.8.0.1)
discourse_image_optim (0.26.2)
exifr (~> 1.2, >= 1.2.2)
@ -436,7 +435,6 @@ DEPENDENCIES
certified
colored2
cppjieba_rb
diffy
discourse-ember-source (~> 3.8.0)
discourse_image_optim
email_reply_trimmer (~> 0.1)

View File

@ -236,7 +236,7 @@ function applyEmoji(
export function setup(helper) {
helper.registerOptions((opts, siteSettings, state) => {
opts.features.emoji = !!siteSettings.enable_emoji;
opts.features.emoji = !state.disableEmojis && !!siteSettings.enable_emoji;
opts.features.emojiShortcuts = !!siteSettings.enable_emoji_shortcuts;
opts.features.inlineEmoji = !!siteSettings.enable_inline_emoji_translation;
opts.emojiSet = siteSettings.emoji_set || "";

View File

@ -29,7 +29,8 @@ export function buildOptions(state) {
lookupUploadUrls,
previewing,
linkify,
censoredWords
censoredWords,
disableEmojis
} = state;
let features = {
@ -76,7 +77,8 @@ export function buildOptions(state) {
markdownIt: true,
injectLineNumbersToPreview:
siteSettings.enable_advanced_editor_preview_sync,
previewing
previewing,
disableEmojis
};
// note, this will mutate options due to the way the API is designed

View File

@ -3,98 +3,147 @@
require_dependency "pretty_text"
class InlineUploads
PLACEHOLDER = "__replace__"
private_constant :PLACEHOLDER
UPLOAD_REGEXP_PATTERN = "/original/(\\dX/(?:[a-f0-9]/)*[a-f0-9]{40}[a-z0-9.]*)"
private_constant :UPLOAD_REGEXP_PATTERN
def self.process(markdown, on_missing: nil)
markdown = markdown.dup
cooked_fragment = Nokogiri::HTML::fragment(PrettyText.cook(markdown))
cooked_fragment = Nokogiri::HTML::fragment(PrettyText.cook(markdown, disable_emojis: true))
link_occurences = []
cooked_fragment.traverse do |node|
if node.name == "img"
# Do nothing
elsif !(node.children.count == 1 && (node.children[0].name != "img" && node.children[0].children.blank?))
elsif !(node.children.count == 1 && (node.children[0].name != "img" && node.children[0].children.blank?)) ||
!node.ancestors.all? { |parent| !parent.attributes["class"]&.value&.include?("quote") }
next
end
if seen_link = matched_uploads(node).first
if actual_link = (node.attributes["href"]&.value || node.attributes["src"]&.value)
link_occurences << [actual_link, true]
else
link_occurences << [seen_link, false]
link_occurences <<
if (actual_link = (node.attributes["href"]&.value || node.attributes["src"]&.value))
{ link: actual_link, is_valid: true }
else
{ link: seen_link, is_valid: false }
end
end
end
raw_matches = []
markdown.scan(/(\[img\]\s?(.+)\s?\[\/img\])/) do |match|
raw_matches << [match[0], match[1], +"![](#{PLACEHOLDER})", $~.offset(0)[0]]
end
markdown.scan(/(!?\[([^\[\]]+)\]\(([a-zA-z0-9\.\/:-]+)\))/) do |match|
if matched_uploads(match[2]).present?
raw_matches << [
match[0],
match[2],
+"#{match[0].start_with?("!") ? "!" : ""}[#{match[1]}](#{PLACEHOLDER})",
$~.offset(0)[0]
]
end
end
markdown.scan(/(<(?!img)[^<>]+\/?>)?(\n*)(([ ]*)<img ([^<>]+)>([ ]*))(\n*)/) do |match|
node = Nokogiri::HTML::fragment(match[2].strip).children[0]
src = node.attributes["src"].value
if matched_uploads(src).present?
text = node.attributes["alt"]&.value
width = node.attributes["width"]&.value
height = node.attributes["height"]&.value
text = "#{text}|#{width}x#{height}" if width && height
after_html_tag = match[0].present?
spaces_before =
if after_html_tag && !match[0].end_with?("/>")
(match[3].present? ? match[3] : " ")
else
""
end
replacement = +"#{spaces_before}![#{text}](#{PLACEHOLDER})"
if after_html_tag && (num_newlines = match[1].length) <= 1
replacement.prepend("\n" * (num_newlines == 0 ? 2 : 1))
end
if after_html_tag && !match[0].end_with?("/>") && (num_newlines = match[6].length) <= 1
replacement += ("\n" * (num_newlines == 0 ? 2 : 1))
end
match[2].strip! if !after_html_tag
raw_matches << [
match[2],
src,
replacement,
$~.offset(0)[0]
]
end
end
markdown.scan(/((<a[^<]+>)([^<\a>]*?)<\/a>)/) do |match|
node = Nokogiri::HTML::fragment(match[0]).children[0]
href = node.attributes["href"]&.value
if href && matched_uploads(href).present?
has_attachment = node.attributes["class"]&.value
index = $~.offset(0)[0]
text = match[2].strip.gsub("\n", "").gsub(/ +/, " ")
text = "#{text}|attachment" if has_attachment
raw_matches << [match[0], href, +"[#{text}](#{PLACEHOLDER})", index]
end
end
db = RailsMultisite::ConnectionManagement.current_db
regexps = [
/(^|\s)?(https?:\/\/[a-zA-Z0-9\.\/-]+\/uploads\/#{db}#{UPLOAD_REGEXP_PATTERN})($|\s)/,
]
if Discourse.store.external?
regexps << /(^|\s)?(https?:#{SiteSetting.Upload.s3_base_url}#{UPLOAD_REGEXP_PATTERN})($|\s)?/
regexps << /(^|\s)?(#{SiteSetting.Upload.s3_cdn_url}#{UPLOAD_REGEXP_PATTERN})($|\s)?/
end
regexps.each do |regexp|
markdown.scan(regexp) do |match|
if matched_uploads(match[1]).present?
raw_matches << [match[1], match[1], +"![](#{PLACEHOLDER})", $~.offset(0)[0]]
end
end
end
raw_fragment = Nokogiri::HTML::fragment(markdown)
raw_matches
.sort { |a, b| a[3] <=> b[3] }
.each do |match, link, replace_with, _index|
raw_fragment.traverse do |node|
if node.name == "img"
# Do nothing
elsif !(node.children.count == 0 || (node.children.count == 1 && node.children[0].children.blank?))
next
end
node_info = link_occurences.shift
next unless node_info&.dig(:is_valid)
matches = matched_uploads(node)
next if matches.blank?
links = extract_links(node)
if link.include?(node_info[:link])
begin
uri = URI(link)
rescue URI::Error
end
matches.zip(links).each do |_match, link|
seen_link, is_valid = link_occurences.shift
next unless (link && is_valid)
if !Discourse.store.external?
next if uri&.host && uri.host != Discourse.current_hostname
end
if link.include?(seen_link)
begin
uri = URI(link)
rescue URI::Error
end
upload = Upload.get_from_url(link)
if !Discourse.store.external?
next if uri&.host && uri.host != Discourse.current_hostname
end
upload = Upload.get_from_url(link)
if upload
new_node =
case node.name
when 'a'
attachment_postfix =
if node.attributes["class"]&.value&.split(" ")&.include?("attachment")
"|attachment"
else
""
end
text = node.children.text.strip.gsub("\n", "").gsub(/ +/, " ")
markdown.sub!(
node.to_s,
"[#{text}#{attachment_postfix}](#{upload.short_url})"
)
when "img"
text = node.attributes["alt"]&.value
width = node.attributes["width"]&.value
height = node.attributes["height"]&.value
text = "#{text}|#{width}x#{height}" if width && height
markdown.sub!(node.to_s, "![#{text}](#{upload.short_url})")
else
if markdown =~ /\[img\]\s?#{link}\s?\[\/img\]/
capture = Regexp.last_match[0]
if capture
markdown.sub!(capture, "![](#{upload.short_url})")
end
elsif markdown =~ /(!?\[([a-z0-9|]+)\]\([a-zA-z0-9\.\/]+\))/
capture = Regexp.last_match[0]
if capture
markdown.sub!(capture, "![#{Regexp.last_match[2]}](#{upload.short_url})")
end
end
end
else
on_missing.call(link) if on_missing
end
if upload
replacement = replace_with.sub!(PLACEHOLDER, upload.short_url)
markdown.sub!(match, replacement)
else
on_missing.call(link) if on_missing
end
end
end
@ -114,21 +163,21 @@ class InlineUploads
if Discourse.store.external?
if Rails.configuration.multisite
regexps << /(#{SiteSetting.Upload.s3_base_url}\/uploads\/#{db}\/original\/(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*))/
regexps << /(#{SiteSetting.Upload.s3_cdn_url}\/uploads\/#{db}\/original\/(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*))/
regexps << /(#{SiteSetting.Upload.s3_base_url}\/uploads\/#{db}#{UPLOAD_REGEXP_PATTERN})/
regexps << /(#{SiteSetting.Upload.s3_cdn_url}\/uploads\/#{db}#{UPLOAD_REGEXP_PATTERN})/
else
regexps << /(#{SiteSetting.Upload.s3_base_url}\/original\/(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*))/
regexps << /(#{SiteSetting.Upload.s3_cdn_url}\/original\/(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*))/
regexps << /(\/uploads\/#{db}\/original\/(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*))/
regexps << /(#{SiteSetting.Upload.s3_base_url}#{UPLOAD_REGEXP_PATTERN})/
regexps << /(#{SiteSetting.Upload.s3_cdn_url}#{UPLOAD_REGEXP_PATTERN})/
regexps << /(\/uploads\/#{db}#{UPLOAD_REGEXP_PATTERN})/
end
else
regexps << /(\/uploads\/#{db}\/original\/(\dX\/(?:[a-f0-9]\/)*[a-f0-9]{40}[a-z0-9\.]*))/
regexps << /(\/uploads\/#{db}#{UPLOAD_REGEXP_PATTERN})/
end
node = node.to_s
regexps.each do |regexp|
node.scan(regexp).each do |matched|
node.scan(regexp) do |matched|
matches << matched[0]
end
end
@ -136,16 +185,4 @@ class InlineUploads
matches
end
private_class_method :matched_uploads
def self.extract_links(node)
links = []
links << node.attributes["href"]&.value
links << node.attributes["src"]&.value
links = links.concat(node.to_s.scan(/\[img\]\s?(.+)\s?\[\/img\]/))
links = links.concat(node.to_s.scan(/!?\[[a-z0-9|]+\]\(([a-zA-z0-9\.\/]+)\)/))
links.flatten!
links.compact!
links
end
private_class_method :extract_links
end

View File

@ -149,6 +149,7 @@ module PrettyText
buffer = +<<~JS
__optInput = {};
__optInput.siteSettings = #{SiteSetting.client_settings_json};
#{"__optInput.disableEmojis = true" if opts[:disable_emojis]}
__paths = #{paths_json};
__optInput.getURL = __getURL;
__optInput.getCurrentUser = __getCurrentUser;

View File

@ -652,7 +652,7 @@ end
desc "Coverts full upload URLs in `Post#raw` to short upload url"
task 'posts:inline_uploads' => :environment do |_, args|
dry_run = ENV["DRY_RUN"] || true
dry_run = (ENV["DRY_RUN"].nil? ? true : ENV["DRY_RUN"] != "false")
scope = Post.joins(:post_uploads)
.distinct("posts.id")
@ -661,32 +661,54 @@ task 'posts:inline_uploads' => :environment do |_, args|
affected_posts_count = scope.count
fixed_count = 0
not_corrected_post_ids = []
failed_to_correct_post_ids = []
scope.find_each do |post|
new_raw = InlineUploads.process(post.raw)
if post.raw !~ Upload::URL_REGEX
affected_posts_count -= 1
next
end
if post.raw != new_raw
if dry_run
puts "Post id #{post.id} raw changed!"
Diffy::Diff.default_format = :color
puts Diffy::Diff.new(post.raw, new_raw, context: 1)
begin
new_raw = InlineUploads.process(post.raw)
if post.raw != new_raw
if dry_run
putc "🏃‍"
else
post.revise!(Discourse.system_user,
{
raw: new_raw
},
skip_validations: true,
force_new_version: true
)
putc "."
end
fixed_count += 1
else
putc "."
not_corrected_post_ids << post.id
end
fixed_count += 1
else
not_corrected_post_ids << post.id
rescue => e
putc "X"
failed_to_correct_post_ids << post.id
end
end
puts
puts "#{fixed_count} out of #{affected_posts_count} affected posts corrected"
if fixed_count != affected_posts_count
puts "Ids of posts that were not correct: #{not_corrected_post_ids}"
if not_corrected_post_ids.present?
puts "Ids of posts that were not corrected: #{not_corrected_post_ids}"
end
if failed_to_correct_post_ids.present?
puts "Ids of posts that encountered failures: #{failed_to_correct_post_ids}"
end
if dry_run
puts "Task was ran in dry run mode. Set `DRY_RUN=false` to revise affected posts"
end
end

View File

@ -54,6 +54,21 @@ RSpec.describe InlineUploads do
expect(InlineUploads.process(md)).to eq(md)
end
it "should not correct links in quotes" do
post = Fabricate(:post)
user = Fabricate(:user)
md = <<~MD
[quote="#{user.username}, post:#{post.post_number}, topic:#{post.topic.id}"]
some quote
#{Discourse.base_url}#{upload3.url}
[/quote]
MD
expect(InlineUploads.process(md)).to eq(md)
end
it "should correct bbcode img URLs to the short version" do
md = <<~MD
[img]#{upload.url}[/img]
@ -70,15 +85,29 @@ RSpec.describe InlineUploads do
MD
end
it "should correct raw image URLs to the short version" do
md = <<~MD
#{Discourse.base_url}#{upload3.url} #{Discourse.base_url}#{upload3.url}
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
![](#{upload3.short_url}) ![](#{upload3.short_url})
MD
end
it "should correct image URLs to the short version" do
md = <<~MD
![image|690x290](#{upload.short_url})
![image](#{upload.url})
![image|100x100](#{upload.url})
![IMAge|690x190,60%](#{upload.short_url})
<img src="#{Discourse.base_url}#{upload.url}" alt="some image">
<img src="#{Discourse.base_url}#{upload.url}" alt="some image"><img src="#{Discourse.base_url}#{upload.url}" alt="some image">
![image](#{upload2.url})
![image|100x100](#{upload3.url})
<img src="#{Discourse.base_url}#{upload.url}" alt="some image" />
<img src="#{Discourse.base_url}#{upload2.url}" alt="some image"><img src="#{Discourse.base_url}#{upload3.url}" alt="some image">
#{Discourse.base_url}#{upload3.url} #{Discourse.base_url}#{upload3.url}
<img src="#{upload.url}" width="5" height="4">
MD
@ -86,25 +115,165 @@ RSpec.describe InlineUploads do
expect(InlineUploads.process(md)).to eq(<<~MD)
![image|690x290](#{upload.short_url})
![image](#{upload.short_url})
![image|100x100](#{upload.short_url})
![IMAge|690x190,60%](#{upload.short_url})
![image](#{upload2.short_url})
![image|100x100](#{upload3.short_url})
![some image](#{upload.short_url})
![some image](#{upload.short_url})![some image](#{upload.short_url})
![some image](#{upload2.short_url})![some image](#{upload3.short_url})
![](#{upload3.short_url}) ![](#{upload3.short_url})
![|5x4](#{upload.short_url})
MD
end
it "should correct attachment URLS with an upload before" do
it "should not be affected by an emoji" do
CustomEmoji.create!(name: 'test', upload: upload3)
Emoji.clear_cache
md = <<~MD
![image](#{upload.short_url})
:test:
![image|690x290](#{upload.url})
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
:test:
![image|690x290](#{upload.short_url})
MD
end
it "should correctly update image sources within anchor or paragraph tags" do
md = <<~MD
<a href="http://somelink.com">
<img src="#{upload.url}" alt="test" width="500" height="500">
</a>
<p>
<img src="#{upload2.url}" alt="test">
</p>
<a href="http://somelink.com"><img src="#{upload3.url}" alt="test" width="500" height="500"></a>
<a href="http://somelink.com"> <img src="#{upload.url}" alt="test" width="500" height="500"> </a>
<a href="http://somelink.com">
<img src="#{upload.url}" alt="test" width="500" height="500">
</a>
<p>Test <img src="#{upload2.url}" alt="test" width="500" height="500"></p>
<hr/>
<img src="#{upload2.url}" alt="test" width="500" height="500">
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
<a href="http://somelink.com">
![test|500x500](#{upload.short_url})
</a>
<p>
![test](#{upload2.short_url})
</p>
<a href="http://somelink.com">
![test|500x500](#{upload3.short_url})
</a>
<a href="http://somelink.com">
![test|500x500](#{upload.short_url})
</a>
<a href="http://somelink.com">
![test|500x500](#{upload.short_url})
</a>
<p>Test ![test|500x500](#{upload2.short_url})</p>
<hr/>
![test|500x500](#{upload2.short_url})
MD
end
it "should not be affected by fake HTML tags" do
md = <<~MD
```
This is some <img src=" and <a href="
```
<img src="#{upload.url}" alt="test">
> some quote
<a class="attachment" href="#{upload2.url}">test2</a>
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
![image](#{upload.short_url})
```
This is some <img src=" and <a href="
```
![test](#{upload.short_url})
> some quote
[test2|attachment](#{upload2.short_url})
MD
end
it "should not be affected by an external or invalid links" do
md = <<~MD
<a id="test">invalid</a>
[test]("https://this.is.some.external.link")
<a href="https://some.external.com/link">test</a>
<a class="attachment" href="#{upload2.url}">test2</a>
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
<a id="test">invalid</a>
[test]("https://this.is.some.external.link")
<a href="https://some.external.com/link">test</a>
[test2|attachment](#{upload2.short_url})
MD
end
it "should correct attachment URLS to the short version when raw contains inline image" do
md = <<~MD
![image](#{upload.short_url}) ![image](#{upload.short_url})
[some complicated.doc %50](#{upload3.url})
<a class="attachment" href="#{upload2.url}">test2</a>
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
![image](#{upload.short_url}) ![image](#{upload.short_url})
[some complicated.doc %50](#{upload3.short_url})
[test2|attachment](#{upload2.short_url})
MD
@ -120,23 +289,23 @@ RSpec.describe InlineUploads do
</a>
- <a class="attachment" href="#{upload2.url}">test2</a>
- <a class="attachment" href="#{upload.url}">test2</a>
- <a class="attachment" href="#{upload2.url}">test2</a>
- <a class="attachment" href="#{upload2.url}">test2</a>
- <a class="attachment" href="#{upload3.url}">test2</a>
<a class="test attachment" href="#{upload3.url}">test3</a>
<a class="test attachment" href="#{upload3.url}">test3</a><a class="test attachment" href="#{upload3.url}">test3</a>
<a class="test attachment" href="#{upload.url}">test3</a>
<a class="test attachment" href="#{upload2.url}">test3</a><a class="test attachment" href="#{upload3.url}">test3</a>
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
[this is some attachment|attachment](#{upload.short_url})
- [test2|attachment](#{upload2.short_url})
- [test2|attachment](#{upload.short_url})
- [test2|attachment](#{upload2.short_url})
- [test2|attachment](#{upload2.short_url})
- [test2|attachment](#{upload3.short_url})
[test3|attachment](#{upload3.short_url})
[test3|attachment](#{upload3.short_url})[test3|attachment](#{upload3.short_url})
[test3|attachment](#{upload.short_url})
[test3|attachment](#{upload2.short_url})[test3|attachment](#{upload3.short_url})
MD
end
@ -200,6 +369,7 @@ RSpec.describe InlineUploads do
describe "s3 uploads" do
let(:upload) { Fabricate(:upload_s3) }
let(:upload2) { Fabricate(:upload_s3) }
before do
SiteSetting.enable_s3_uploads = true
@ -212,12 +382,12 @@ RSpec.describe InlineUploads do
it "should correct image URLs to the short version" do
md = <<~MD
<img src="#{upload.url}" alt="some image">
<img src="#{URI.join(SiteSetting.s3_cdn_url, URI.parse(upload.url).path).to_s}" alt="some image">
<img src="#{URI.join(SiteSetting.s3_cdn_url, URI.parse(upload2.url).path).to_s}" alt="some image">
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
![some image](#{upload.short_url})
![some image](#{upload.short_url})
![some image](#{upload2.short_url})
MD
end
@ -226,13 +396,19 @@ RSpec.describe InlineUploads do
Rails.configuration.multisite = true
md = <<~MD
https:#{upload2.url} https:#{upload2.url}
#{URI.join(SiteSetting.s3_cdn_url, URI.parse(upload2.url).path).to_s}
<img src="#{upload.url}" alt="some image">
<img src="#{URI.join(SiteSetting.s3_cdn_url, URI.parse(upload.url).path).to_s}" alt="some image">
<img src="#{URI.join(SiteSetting.s3_cdn_url, URI.parse(upload2.url).path).to_s}" alt="some image">
MD
expect(InlineUploads.process(md)).to eq(<<~MD)
![](#{upload2.short_url}) ![](#{upload2.short_url})
![](#{upload2.short_url})
![some image](#{upload.short_url})
![some image](#{upload.short_url})
![some image](#{upload2.short_url})
MD
ensure
Rails.configuration.multisite = false