PERF: avoid race conditions when creating topic links

Previously the code was very race condition prone leading to
odd failures in production

It was re-written in raw SQL to avoid conditions where rows
conflict on inserts

There is no clean way in ActiveRecord to do:

Insert, on conflict do nothing and return existing id.

This also increases test coverage, we were previously not testing
the code responsible for crawling external sites directly
This commit is contained in:
Sam Saffron 2020-05-13 16:05:39 +10:00
parent ca8d11635c
commit 7f841dc21f
No known key found for this signature in database
GPG Key ID: B9606168D2FFD9F5
2 changed files with 155 additions and 51 deletions

View File

@ -140,9 +140,12 @@ class TopicLink < ActiveRecord::Base
end
# Crawl a link's title after it's saved
def self.crawl_link_title(topic_link_id)
Jobs.enqueue(:crawl_topic_link, topic_link_id: topic_link_id)
end
def crawl_link_title
Jobs.enqueue(:crawl_topic_link, topic_link_id: id)
TopicLink.crawl_link_title(id)
end
def self.duplicate_lookup(topic)
@ -167,6 +170,97 @@ class TopicLink < ActiveRecord::Base
private
# This pattern is used to create topic links very efficiently with minimal
# errors under heavy concurrent use
#
# It avoids a SELECT to find out if the record is there and minimizes all
# the work it needs to do in case a record is missing
#
# It handles calling the required callback and has parity with Rails implementation
#
# Usually we would rely on ActiveRecord but in this case we have had lots of churn
# around creation of topic links leading to hard to debug log messages in production
#
def self.safe_create_topic_link(
post_id:,
user_id:,
topic_id:,
url:,
domain: nil,
internal: false,
link_topic_id: nil,
link_post_id: nil,
quote: false,
extension: nil,
reflection: false
)
domain ||= Discourse.current_hostname
sql = <<~SQL
WITH new_row AS(
INSERT INTO topic_links(
post_id,
user_id,
topic_id,
url,
domain,
internal,
link_topic_id,
link_post_id,
quote,
extension,
reflection,
created_at,
updated_at
) VALUES (
:post_id,
:user_id,
:topic_id,
:url,
:domain,
:internal,
:link_topic_id,
:link_post_id,
:quote,
:extension,
:reflection,
:now,
:now
)
ON CONFLICT DO NOTHING
RETURNING id
)
SELECT COALESCE(
(SELECT id FROM new_row),
(SELECT id FROM topic_links WHERE post_id = :post_id AND topic_id = :topic_id AND url = :url)
), (SELECT id FROM new_row) IS NOT NULL
SQL
topic_link_id, new_record = DB.query_single(sql,
post_id: post_id,
user_id: user_id,
topic_id: topic_id,
url: url,
domain: domain,
internal: internal,
link_topic_id: link_topic_id,
link_post_id: link_post_id,
quote: quote,
extension: extension,
reflection: reflection,
now: Time.now
)
if new_record
DB.after_commit do
crawl_link_title(topic_link_id)
end
end
topic_link_id
end
def self.ensure_entry_for(post, link, parsed)
url = link.url
internal = false
@ -210,25 +304,20 @@ class TopicLink < ActiveRecord::Base
url = url[0...TopicLink.max_url_length]
return nil if parsed && parsed.host && parsed.host.length > TopicLink.max_domain_length
unless TopicLink.exists?(topic_id: post.topic_id, post_id: post.id, url: url)
file_extension = File.extname(parsed.path)[1..10].downcase unless parsed.path.nil? || File.extname(parsed.path).empty?
begin
TopicLink.create(
safe_create_topic_link(
post_id: post.id,
user_id: post.user_id,
topic_id: post.topic_id,
url: url,
domain: parsed.host || Discourse.current_hostname,
domain: parsed.host,
internal: internal,
link_topic_id: topic&.id,
link_post_id: reflected_post.try(:id),
quote: link.is_quote,
extension: file_extension
extension: file_extension,
)
rescue ActiveRecord::RecordNotUnique
# it's fine
end
end
reflected_id = nil
@ -236,12 +325,9 @@ class TopicLink < ActiveRecord::Base
if topic && post.topic && topic.archetype != 'private_message' && post.topic.archetype != 'private_message' && post.topic.visible?
prefix = Discourse.base_url_no_prefix
reflected_url = "#{prefix}#{post.topic.relative_url(post.post_number)}"
tl = TopicLink.find_by(topic_id: topic&.id,
post_id: reflected_post&.id,
url: reflected_url)
unless tl
tl = TopicLink.create(user_id: post.user_id,
reflected_id = safe_create_topic_link(
user_id: post.user_id,
topic_id: topic&.id,
post_id: reflected_post&.id,
url: reflected_url,
@ -249,13 +335,11 @@ class TopicLink < ActiveRecord::Base
reflection: true,
internal: true,
link_topic_id: post.topic_id,
link_post_id: post.id)
link_post_id: post.id
)
end
reflected_id = tl.id if tl.persisted?
end
[url, reflected_id]
end

View File

@ -28,31 +28,49 @@ describe TopicLink do
end
describe 'external links' do
fab!(:post2) do
Fabricate(:post, raw: <<~RAW, user: user, topic: topic)
it 'correctly handles links' do
non_png = "https://b.com/#{SecureRandom.hex}"
# prepare a title for one of the links
stub_request(:get, non_png).
with(headers: {
'Accept' => '*/*',
'Accept-Encoding' => 'gzip',
'Host' => 'b.com',
}).
to_return(status: 200, body: "<html><head><title>amazing</title></head></html>", headers: {})
# so we run crawl_topic_links
Jobs.run_immediately!
png_title = "#{SecureRandom.hex}.png"
png = "https://awesome.com/#{png_title}"
post = Fabricate(:post, raw: <<~RAW, user: user, topic: topic)
http://a.com/
https://b.com/b
#{non_png}
http://#{'a' * 200}.com/invalid
//b.com/#{'a' * 500}
#{png}
RAW
end
before do
TopicLink.extract_from(post2)
end
TopicLink.extract_from(post)
# we have a special rule for images title where we pull them out of the filename
expect(topic.topic_links.where(url: png).pluck(:title).first).to eq(png_title)
expect(topic.topic_links.where(url: non_png).pluck(:title).first).to eq("amazing")
it 'works' do
expect(topic.topic_links.pluck(:url)).to contain_exactly(
png,
non_png,
"http://a.com/",
"https://b.com/b",
"//b.com/#{'a' * 500}"[0...TopicLink.max_url_length]
)
end
it "doesn't reset them when rebaking" do
old_ids = topic.topic_links.pluck(:id)
TopicLink.extract_from(post2)
TopicLink.extract_from(post)
new_ids = topic.topic_links.pluck(:id)
@ -107,15 +125,17 @@ describe TopicLink do
# this is subtle, but we had a bug were second time
# TopicLink.extract_from was called a reflection was nuked
2.times do
topic.reload
TopicLink.extract_from(linked_post)
topic.reload
other_topic.reload
link = topic.topic_links.first
expect(link).to be_present
expect(link).to be_internal
expect(link.url).to eq(url)
expect(link.domain).to eq(test_uri.host)
link.link_topic_id == other_topic.id
expect(link.link_topic_id). to eq(other_topic.id)
expect(link).not_to be_reflection
reflection = other_topic.topic_links.first