From 7e3ea5d644516b10dacb6a17cdd53e20c987b060 Mon Sep 17 00:00:00 2001 From: Robin Ward Date: Sat, 5 Apr 2014 14:47:25 -0400 Subject: [PATCH] Support for crawling topic links --- .../discourse/helpers/application_helpers.js | 22 +++++ .../components/topic-map.js.handlebars | 1 + .../stylesheets/desktop/topic-post.scss | 5 + app/jobs/regular/crawl_topic_link.rb | 94 +++++++++++++++++++ app/models/topic_link.rb | 18 +++- app/serializers/topic_link_serializer.rb | 7 +- ...20140404143501_add_title_to_topic_links.rb | 6 ++ spec/jobs/crawl_topic_link_spec.rb | 12 +++ 8 files changed, 160 insertions(+), 5 deletions(-) create mode 100644 app/jobs/regular/crawl_topic_link.rb create mode 100644 db/migrate/20140404143501_add_title_to_topic_links.rb create mode 100644 spec/jobs/crawl_topic_link_spec.rb diff --git a/app/assets/javascripts/discourse/helpers/application_helpers.js b/app/assets/javascripts/discourse/helpers/application_helpers.js index ef7ab697eee..2d26922b2b5 100644 --- a/app/assets/javascripts/discourse/helpers/application_helpers.js +++ b/app/assets/javascripts/discourse/helpers/application_helpers.js @@ -385,3 +385,25 @@ Handlebars.registerHelper('customHTML', function(name, contextString, options) { Ember.Handlebars.registerBoundHelper('humanSize', function(size) { return new Handlebars.SafeString(I18n.toHumanSize(size)); }); + +/** + Renders the domain for a link if it's not internal and has a title. + + @method link-domain + @for Handlebars +**/ +Handlebars.registerHelper('link-domain', function(property, options) { + var link = Em.get(this, property, options); + if (link) { + var internal = Em.get(link, 'internal'), + hasTitle = (!Em.isEmpty(Em.get(link, 'title'))); + if (hasTitle && !internal) { + var domain = Em.get(link, 'domain'); + if (!Em.isEmpty(domain)) { + var s = domain.split('.'); + domain = s[s.length-2] + "." + s[s.length-1]; + return new Handlebars.SafeString("(" + domain + ")"); + } + } + } +}); diff --git a/app/assets/javascripts/discourse/templates/components/topic-map.js.handlebars b/app/assets/javascripts/discourse/templates/components/topic-map.js.handlebars index 2f49ac5945b..8fa62796c4a 100644 --- a/app/assets/javascripts/discourse/templates/components/topic-map.js.handlebars +++ b/app/assets/javascripts/discourse/templates/components/topic-map.js.handlebars @@ -65,6 +65,7 @@ {{#if title}}{{title}}{{else}}{{shortenUrl url}}{{/if}} {{#unless internal}}{{/unless}} + {{link-domain this}} {{/groupedEach}} diff --git a/app/assets/stylesheets/desktop/topic-post.scss b/app/assets/stylesheets/desktop/topic-post.scss index aea3e3be873..44741759c62 100644 --- a/app/assets/stylesheets/desktop/topic-post.scss +++ b/app/assets/stylesheets/desktop/topic-post.scss @@ -286,6 +286,11 @@ a.star { list-style: none; } + span.domain { + font-size: 10px; + color: $secondary_text_color; + } + .avatars { > div { float: left; diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb new file mode 100644 index 00000000000..7ce42781820 --- /dev/null +++ b/app/jobs/regular/crawl_topic_link.rb @@ -0,0 +1,94 @@ +require 'open-uri' +require 'nokogiri' +require 'excon' + +module Jobs + class CrawlTopicLink < Jobs::Base + + class ReadEnough < Exception; end + + # Retrieve a header regardless of case sensitivity + def self.header_for(head, name) + header = head.headers.detect do |k, v| + name == k.downcase + end + header[1] if header + end + + def self.request_headers(uri) + { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36", + "Accept" => "text/html", + "Host" => uri.host } + end + + # Follow any redirects that might exist + def self.final_uri(url, limit=5) + return if limit < 0 + + uri = URI(url) + return if uri.blank? || uri.host.blank? + headers = CrawlTopicLink.request_headers(uri) + head = Excon.head(url, read_timeout: 20, headers: headers) + if head.status == 200 + uri = nil unless header_for(head, 'content-type') =~ /text\/html/ + return uri + end + + location = header_for(head, 'location') + if location + location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/" + return final_uri(location, limit - 1) + end + + nil + end + + # Fetch the beginning of a HTML document at a url + def self.fetch_beginning(url) + uri = final_uri(url) + return "" unless uri + + result = "" + streamer = lambda do |chunk, remaining_bytes, total_bytes| + result << chunk + + # Using exceptions for flow control is really bad, but there really seems to + # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for + # that matter!) + raise ReadEnough.new if result.size > 1024 + end + Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri)) + result + + rescue ReadEnough + result + end + + def execute(args) + raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present? + topic_link = TopicLink.where(id: args[:topic_link_id], internal: false, crawled_at: nil).first + return if topic_link.blank? + + crawled = false + + result = CrawlTopicLink.fetch_beginning(topic_link.url) + doc = Nokogiri::HTML(result) + if doc + title = doc.at('title').try(:inner_text) + if title.present? + title.gsub!(/\n/, ' ') + title.gsub!(/ +/, ' ') + title.strip! + if title.present? + crawled = topic_link.update_attributes(title: title[0..255], crawled_at: Time.now) + end + end + end + rescue Exception + # If there was a connection error, do nothing + ensure + topic_link.update_column(:crawled_at, Time.now) if !crawled && topic_link.present? + end + + end +end diff --git a/app/models/topic_link.rb b/app/models/topic_link.rb index cc171dbde10..b1fd9f04048 100644 --- a/app/models/topic_link.rb +++ b/app/models/topic_link.rb @@ -18,6 +18,8 @@ class TopicLink < ActiveRecord::Base validate :link_to_self + after_commit :crawl_link_title + # Make sure a topic can't link to itself def link_to_self errors.add(:base, "can't link to the same topic") if (topic_id == link_topic_id) @@ -27,17 +29,18 @@ class TopicLink < ActiveRecord::Base # Sam: complicated reports are really hard in AR builder = SqlBuilder.new("SELECT ftl.url, - ft.title, + COALESCE(ft.title, ftl.title) AS title, ftl.link_topic_id, ftl.reflection, ftl.internal, + ftl.domain, MIN(ftl.user_id) AS user_id, SUM(clicks) AS clicks FROM topic_links AS ftl LEFT JOIN topics AS ft ON ftl.link_topic_id = ft.id LEFT JOIN categories AS c ON c.id = ft.category_id /*where*/ - GROUP BY ftl.url, ft.title, ftl.link_topic_id, ftl.reflection, ftl.internal + GROUP BY ftl.url, ft.title, ftl.title, ftl.link_topic_id, ftl.reflection, ftl.internal, ftl.domain ORDER BY clicks DESC") builder.where('ftl.topic_id = :topic_id', topic_id: topic_id) @@ -58,9 +61,10 @@ class TopicLink < ActiveRecord::Base l.post_id, l.url, l.clicks, - t.title, + COALESCE(t.title, l.title) AS title, l.internal, - l.reflection + l.reflection, + l.domain FROM topic_links l LEFT JOIN topics t ON t.id = l.link_topic_id LEFT JOIN categories AS c ON c.id = t.category_id @@ -87,6 +91,7 @@ class TopicLink < ActiveRecord::Base def self.extract_from(post) return unless post.present? + added_urls = [] TopicLink.transaction do added_urls = [] @@ -184,6 +189,11 @@ class TopicLink < ActiveRecord::Base end end end + + # Crawl a link's title after it's saved + def crawl_link_title + Jobs.enqueue(:crawl_topic_link, topic_link_id: id) + end end # == Schema Information diff --git a/app/serializers/topic_link_serializer.rb b/app/serializers/topic_link_serializer.rb index f73972a4138..e14c9e8d554 100644 --- a/app/serializers/topic_link_serializer.rb +++ b/app/serializers/topic_link_serializer.rb @@ -6,7 +6,8 @@ class TopicLinkSerializer < ApplicationSerializer :internal, :reflection, :clicks, - :user_id + :user_id, + :domain def url object['url'] @@ -40,4 +41,8 @@ class TopicLinkSerializer < ApplicationSerializer object['user_id'].present? end + def domain + object['domain'] + end + end diff --git a/db/migrate/20140404143501_add_title_to_topic_links.rb b/db/migrate/20140404143501_add_title_to_topic_links.rb new file mode 100644 index 00000000000..90d142782d0 --- /dev/null +++ b/db/migrate/20140404143501_add_title_to_topic_links.rb @@ -0,0 +1,6 @@ +class AddTitleToTopicLinks < ActiveRecord::Migration + def change + add_column :topic_links, :title, :string + add_column :topic_links, :crawled_at, :datetime + end +end diff --git a/spec/jobs/crawl_topic_link_spec.rb b/spec/jobs/crawl_topic_link_spec.rb new file mode 100644 index 00000000000..9224c499e4d --- /dev/null +++ b/spec/jobs/crawl_topic_link_spec.rb @@ -0,0 +1,12 @@ +require 'spec_helper' +require_dependency 'jobs/base' +require_dependency 'jobs/regular/crawl_topic_link' + +describe Jobs::CrawlTopicLink do + + let(:job) { Jobs::CrawlTopicLink.new } + + it "needs a topic_link_id" do + -> { job.execute({}) }.should raise_error(Discourse::InvalidParameters) + end +end