diff --git a/app/models/topic_link.rb b/app/models/topic_link.rb index afc6faea007..3f992d9f8ff 100644 --- a/app/models/topic_link.rb +++ b/app/models/topic_link.rb @@ -140,9 +140,12 @@ class TopicLink < ActiveRecord::Base end - # Crawl a link's title after it's saved + def self.crawl_link_title(topic_link_id) + Jobs.enqueue(:crawl_topic_link, topic_link_id: topic_link_id) + end + def crawl_link_title - Jobs.enqueue(:crawl_topic_link, topic_link_id: id) + TopicLink.crawl_link_title(id) end def self.duplicate_lookup(topic) @@ -167,6 +170,97 @@ class TopicLink < ActiveRecord::Base private + # This pattern is used to create topic links very efficiently with minimal + # errors under heavy concurrent use + # + # It avoids a SELECT to find out if the record is there and minimizes all + # the work it needs to do in case a record is missing + # + # It handles calling the required callback and has parity with Rails implementation + # + # Usually we would rely on ActiveRecord but in this case we have had lots of churn + # around creation of topic links leading to hard to debug log messages in production + # + def self.safe_create_topic_link( + post_id:, + user_id:, + topic_id:, + url:, + domain: nil, + internal: false, + link_topic_id: nil, + link_post_id: nil, + quote: false, + extension: nil, + reflection: false + ) + + domain ||= Discourse.current_hostname + + sql = <<~SQL + WITH new_row AS( + INSERT INTO topic_links( + post_id, + user_id, + topic_id, + url, + domain, + internal, + link_topic_id, + link_post_id, + quote, + extension, + reflection, + created_at, + updated_at + ) VALUES ( + :post_id, + :user_id, + :topic_id, + :url, + :domain, + :internal, + :link_topic_id, + :link_post_id, + :quote, + :extension, + :reflection, + :now, + :now + ) + ON CONFLICT DO NOTHING + RETURNING id + ) + SELECT COALESCE( + (SELECT id FROM new_row), + (SELECT id FROM topic_links WHERE post_id = :post_id AND topic_id = :topic_id AND url = :url) + ), (SELECT id FROM new_row) IS NOT NULL + SQL + + topic_link_id, new_record = DB.query_single(sql, + post_id: post_id, + user_id: user_id, + topic_id: topic_id, + url: url, + domain: domain, + internal: internal, + link_topic_id: link_topic_id, + link_post_id: link_post_id, + quote: quote, + extension: extension, + reflection: reflection, + now: Time.now + ) + + if new_record + DB.after_commit do + crawl_link_title(topic_link_id) + end + end + + topic_link_id + end + def self.ensure_entry_for(post, link, parsed) url = link.url internal = false @@ -210,25 +304,20 @@ class TopicLink < ActiveRecord::Base url = url[0...TopicLink.max_url_length] return nil if parsed && parsed.host && parsed.host.length > TopicLink.max_domain_length - unless TopicLink.exists?(topic_id: post.topic_id, post_id: post.id, url: url) - file_extension = File.extname(parsed.path)[1..10].downcase unless parsed.path.nil? || File.extname(parsed.path).empty? - begin - TopicLink.create( - post_id: post.id, - user_id: post.user_id, - topic_id: post.topic_id, - url: url, - domain: parsed.host || Discourse.current_hostname, - internal: internal, - link_topic_id: topic&.id, - link_post_id: reflected_post.try(:id), - quote: link.is_quote, - extension: file_extension - ) - rescue ActiveRecord::RecordNotUnique - # it's fine - end - end + file_extension = File.extname(parsed.path)[1..10].downcase unless parsed.path.nil? || File.extname(parsed.path).empty? + + safe_create_topic_link( + post_id: post.id, + user_id: post.user_id, + topic_id: post.topic_id, + url: url, + domain: parsed.host, + internal: internal, + link_topic_id: topic&.id, + link_post_id: reflected_post.try(:id), + quote: link.is_quote, + extension: file_extension, + ) reflected_id = nil @@ -236,24 +325,19 @@ class TopicLink < ActiveRecord::Base if topic && post.topic && topic.archetype != 'private_message' && post.topic.archetype != 'private_message' && post.topic.visible? prefix = Discourse.base_url_no_prefix reflected_url = "#{prefix}#{post.topic.relative_url(post.post_number)}" - tl = TopicLink.find_by(topic_id: topic&.id, - post_id: reflected_post&.id, - url: reflected_url) - unless tl - tl = TopicLink.create(user_id: post.user_id, - topic_id: topic&.id, - post_id: reflected_post&.id, - url: reflected_url, - domain: Discourse.current_hostname, - reflection: true, - internal: true, - link_topic_id: post.topic_id, - link_post_id: post.id) + reflected_id = safe_create_topic_link( + user_id: post.user_id, + topic_id: topic&.id, + post_id: reflected_post&.id, + url: reflected_url, + domain: Discourse.current_hostname, + reflection: true, + internal: true, + link_topic_id: post.topic_id, + link_post_id: post.id + ) - end - - reflected_id = tl.id if tl.persisted? end [url, reflected_id] diff --git a/spec/models/topic_link_spec.rb b/spec/models/topic_link_spec.rb index 2f8a0bec33e..c1a11711a7e 100644 --- a/spec/models/topic_link_spec.rb +++ b/spec/models/topic_link_spec.rb @@ -28,31 +28,49 @@ describe TopicLink do end describe 'external links' do - fab!(:post2) do - Fabricate(:post, raw: <<~RAW, user: user, topic: topic) + it 'correctly handles links' do + + non_png = "https://b.com/#{SecureRandom.hex}" + + # prepare a title for one of the links + stub_request(:get, non_png). + with(headers: { + 'Accept' => '*/*', + 'Accept-Encoding' => 'gzip', + 'Host' => 'b.com', + }). + to_return(status: 200, body: "amazing", headers: {}) + + # so we run crawl_topic_links + Jobs.run_immediately! + + png_title = "#{SecureRandom.hex}.png" + png = "https://awesome.com/#{png_title}" + + post = Fabricate(:post, raw: <<~RAW, user: user, topic: topic) http://a.com/ - https://b.com/b + #{non_png} http://#{'a' * 200}.com/invalid //b.com/#{'a' * 500} + #{png} RAW - end - before do - TopicLink.extract_from(post2) - end + TopicLink.extract_from(post) + + # we have a special rule for images title where we pull them out of the filename + expect(topic.topic_links.where(url: png).pluck(:title).first).to eq(png_title) + expect(topic.topic_links.where(url: non_png).pluck(:title).first).to eq("amazing") - it 'works' do expect(topic.topic_links.pluck(:url)).to contain_exactly( + png, + non_png, "http://a.com/", - "https://b.com/b", "//b.com/#{'a' * 500}"[0...TopicLink.max_url_length] ) - end - it "doesn't reset them when rebaking" do old_ids = topic.topic_links.pluck(:id) - TopicLink.extract_from(post2) + TopicLink.extract_from(post) new_ids = topic.topic_links.pluck(:id) @@ -107,15 +125,17 @@ describe TopicLink do # this is subtle, but we had a bug were second time # TopicLink.extract_from was called a reflection was nuked 2.times do - topic.reload TopicLink.extract_from(linked_post) + topic.reload + other_topic.reload + link = topic.topic_links.first expect(link).to be_present expect(link).to be_internal expect(link.url).to eq(url) expect(link.domain).to eq(test_uri.host) - link.link_topic_id == other_topic.id + expect(link.link_topic_id). to eq(other_topic.id) expect(link).not_to be_reflection reflection = other_topic.topic_links.first