PERF: avoid race conditions when creating topic links
Previously the code was very race condition prone leading to odd failures in production It was re-written in raw SQL to avoid conditions where rows conflict on inserts There is no clean way in ActiveRecord to do: Insert, on conflict do nothing and return existing id. This also increases test coverage, we were previously not testing the code responsible for crawling external sites directly
This commit is contained in:
parent
ca8d11635c
commit
7f841dc21f
|
@ -140,9 +140,12 @@ class TopicLink < ActiveRecord::Base
|
|||
|
||||
end
|
||||
|
||||
# Crawl a link's title after it's saved
|
||||
def self.crawl_link_title(topic_link_id)
|
||||
Jobs.enqueue(:crawl_topic_link, topic_link_id: topic_link_id)
|
||||
end
|
||||
|
||||
def crawl_link_title
|
||||
Jobs.enqueue(:crawl_topic_link, topic_link_id: id)
|
||||
TopicLink.crawl_link_title(id)
|
||||
end
|
||||
|
||||
def self.duplicate_lookup(topic)
|
||||
|
@ -167,6 +170,97 @@ class TopicLink < ActiveRecord::Base
|
|||
|
||||
private
|
||||
|
||||
# This pattern is used to create topic links very efficiently with minimal
|
||||
# errors under heavy concurrent use
|
||||
#
|
||||
# It avoids a SELECT to find out if the record is there and minimizes all
|
||||
# the work it needs to do in case a record is missing
|
||||
#
|
||||
# It handles calling the required callback and has parity with Rails implementation
|
||||
#
|
||||
# Usually we would rely on ActiveRecord but in this case we have had lots of churn
|
||||
# around creation of topic links leading to hard to debug log messages in production
|
||||
#
|
||||
def self.safe_create_topic_link(
|
||||
post_id:,
|
||||
user_id:,
|
||||
topic_id:,
|
||||
url:,
|
||||
domain: nil,
|
||||
internal: false,
|
||||
link_topic_id: nil,
|
||||
link_post_id: nil,
|
||||
quote: false,
|
||||
extension: nil,
|
||||
reflection: false
|
||||
)
|
||||
|
||||
domain ||= Discourse.current_hostname
|
||||
|
||||
sql = <<~SQL
|
||||
WITH new_row AS(
|
||||
INSERT INTO topic_links(
|
||||
post_id,
|
||||
user_id,
|
||||
topic_id,
|
||||
url,
|
||||
domain,
|
||||
internal,
|
||||
link_topic_id,
|
||||
link_post_id,
|
||||
quote,
|
||||
extension,
|
||||
reflection,
|
||||
created_at,
|
||||
updated_at
|
||||
) VALUES (
|
||||
:post_id,
|
||||
:user_id,
|
||||
:topic_id,
|
||||
:url,
|
||||
:domain,
|
||||
:internal,
|
||||
:link_topic_id,
|
||||
:link_post_id,
|
||||
:quote,
|
||||
:extension,
|
||||
:reflection,
|
||||
:now,
|
||||
:now
|
||||
)
|
||||
ON CONFLICT DO NOTHING
|
||||
RETURNING id
|
||||
)
|
||||
SELECT COALESCE(
|
||||
(SELECT id FROM new_row),
|
||||
(SELECT id FROM topic_links WHERE post_id = :post_id AND topic_id = :topic_id AND url = :url)
|
||||
), (SELECT id FROM new_row) IS NOT NULL
|
||||
SQL
|
||||
|
||||
topic_link_id, new_record = DB.query_single(sql,
|
||||
post_id: post_id,
|
||||
user_id: user_id,
|
||||
topic_id: topic_id,
|
||||
url: url,
|
||||
domain: domain,
|
||||
internal: internal,
|
||||
link_topic_id: link_topic_id,
|
||||
link_post_id: link_post_id,
|
||||
quote: quote,
|
||||
extension: extension,
|
||||
reflection: reflection,
|
||||
now: Time.now
|
||||
)
|
||||
|
||||
if new_record
|
||||
DB.after_commit do
|
||||
crawl_link_title(topic_link_id)
|
||||
end
|
||||
end
|
||||
|
||||
topic_link_id
|
||||
end
|
||||
|
||||
def self.ensure_entry_for(post, link, parsed)
|
||||
url = link.url
|
||||
internal = false
|
||||
|
@ -210,25 +304,20 @@ class TopicLink < ActiveRecord::Base
|
|||
url = url[0...TopicLink.max_url_length]
|
||||
return nil if parsed && parsed.host && parsed.host.length > TopicLink.max_domain_length
|
||||
|
||||
unless TopicLink.exists?(topic_id: post.topic_id, post_id: post.id, url: url)
|
||||
file_extension = File.extname(parsed.path)[1..10].downcase unless parsed.path.nil? || File.extname(parsed.path).empty?
|
||||
begin
|
||||
TopicLink.create(
|
||||
|
||||
safe_create_topic_link(
|
||||
post_id: post.id,
|
||||
user_id: post.user_id,
|
||||
topic_id: post.topic_id,
|
||||
url: url,
|
||||
domain: parsed.host || Discourse.current_hostname,
|
||||
domain: parsed.host,
|
||||
internal: internal,
|
||||
link_topic_id: topic&.id,
|
||||
link_post_id: reflected_post.try(:id),
|
||||
quote: link.is_quote,
|
||||
extension: file_extension
|
||||
extension: file_extension,
|
||||
)
|
||||
rescue ActiveRecord::RecordNotUnique
|
||||
# it's fine
|
||||
end
|
||||
end
|
||||
|
||||
reflected_id = nil
|
||||
|
||||
|
@ -236,12 +325,9 @@ class TopicLink < ActiveRecord::Base
|
|||
if topic && post.topic && topic.archetype != 'private_message' && post.topic.archetype != 'private_message' && post.topic.visible?
|
||||
prefix = Discourse.base_url_no_prefix
|
||||
reflected_url = "#{prefix}#{post.topic.relative_url(post.post_number)}"
|
||||
tl = TopicLink.find_by(topic_id: topic&.id,
|
||||
post_id: reflected_post&.id,
|
||||
url: reflected_url)
|
||||
|
||||
unless tl
|
||||
tl = TopicLink.create(user_id: post.user_id,
|
||||
reflected_id = safe_create_topic_link(
|
||||
user_id: post.user_id,
|
||||
topic_id: topic&.id,
|
||||
post_id: reflected_post&.id,
|
||||
url: reflected_url,
|
||||
|
@ -249,13 +335,11 @@ class TopicLink < ActiveRecord::Base
|
|||
reflection: true,
|
||||
internal: true,
|
||||
link_topic_id: post.topic_id,
|
||||
link_post_id: post.id)
|
||||
link_post_id: post.id
|
||||
)
|
||||
|
||||
end
|
||||
|
||||
reflected_id = tl.id if tl.persisted?
|
||||
end
|
||||
|
||||
[url, reflected_id]
|
||||
end
|
||||
|
||||
|
|
|
@ -28,31 +28,49 @@ describe TopicLink do
|
|||
end
|
||||
|
||||
describe 'external links' do
|
||||
fab!(:post2) do
|
||||
Fabricate(:post, raw: <<~RAW, user: user, topic: topic)
|
||||
it 'correctly handles links' do
|
||||
|
||||
non_png = "https://b.com/#{SecureRandom.hex}"
|
||||
|
||||
# prepare a title for one of the links
|
||||
stub_request(:get, non_png).
|
||||
with(headers: {
|
||||
'Accept' => '*/*',
|
||||
'Accept-Encoding' => 'gzip',
|
||||
'Host' => 'b.com',
|
||||
}).
|
||||
to_return(status: 200, body: "<html><head><title>amazing</title></head></html>", headers: {})
|
||||
|
||||
# so we run crawl_topic_links
|
||||
Jobs.run_immediately!
|
||||
|
||||
png_title = "#{SecureRandom.hex}.png"
|
||||
png = "https://awesome.com/#{png_title}"
|
||||
|
||||
post = Fabricate(:post, raw: <<~RAW, user: user, topic: topic)
|
||||
http://a.com/
|
||||
https://b.com/b
|
||||
#{non_png}
|
||||
http://#{'a' * 200}.com/invalid
|
||||
//b.com/#{'a' * 500}
|
||||
#{png}
|
||||
RAW
|
||||
end
|
||||
|
||||
before do
|
||||
TopicLink.extract_from(post2)
|
||||
end
|
||||
TopicLink.extract_from(post)
|
||||
|
||||
# we have a special rule for images title where we pull them out of the filename
|
||||
expect(topic.topic_links.where(url: png).pluck(:title).first).to eq(png_title)
|
||||
expect(topic.topic_links.where(url: non_png).pluck(:title).first).to eq("amazing")
|
||||
|
||||
it 'works' do
|
||||
expect(topic.topic_links.pluck(:url)).to contain_exactly(
|
||||
png,
|
||||
non_png,
|
||||
"http://a.com/",
|
||||
"https://b.com/b",
|
||||
"//b.com/#{'a' * 500}"[0...TopicLink.max_url_length]
|
||||
)
|
||||
end
|
||||
|
||||
it "doesn't reset them when rebaking" do
|
||||
old_ids = topic.topic_links.pluck(:id)
|
||||
|
||||
TopicLink.extract_from(post2)
|
||||
TopicLink.extract_from(post)
|
||||
|
||||
new_ids = topic.topic_links.pluck(:id)
|
||||
|
||||
|
@ -107,15 +125,17 @@ describe TopicLink do
|
|||
# this is subtle, but we had a bug were second time
|
||||
# TopicLink.extract_from was called a reflection was nuked
|
||||
2.times do
|
||||
topic.reload
|
||||
TopicLink.extract_from(linked_post)
|
||||
|
||||
topic.reload
|
||||
other_topic.reload
|
||||
|
||||
link = topic.topic_links.first
|
||||
expect(link).to be_present
|
||||
expect(link).to be_internal
|
||||
expect(link.url).to eq(url)
|
||||
expect(link.domain).to eq(test_uri.host)
|
||||
link.link_topic_id == other_topic.id
|
||||
expect(link.link_topic_id). to eq(other_topic.id)
|
||||
expect(link).not_to be_reflection
|
||||
|
||||
reflection = other_topic.topic_links.first
|
||||
|
|
Loading…
Reference in New Issue