Support for crawling topic links

This commit is contained in:
Robin Ward 2014-04-05 14:47:25 -04:00
parent bb0baa6d7c
commit 7e3ea5d644
8 changed files with 160 additions and 5 deletions

View File

@ -385,3 +385,25 @@ Handlebars.registerHelper('customHTML', function(name, contextString, options) {
Ember.Handlebars.registerBoundHelper('humanSize', function(size) { Ember.Handlebars.registerBoundHelper('humanSize', function(size) {
return new Handlebars.SafeString(I18n.toHumanSize(size)); return new Handlebars.SafeString(I18n.toHumanSize(size));
}); });
/**
  Renders the domain for a link if it's not internal and has a title.

  @method link-domain
  @for Handlebars
**/
Handlebars.registerHelper('link-domain', function(property, options) {
  var link = Em.get(this, property, options);
  if (link) {
    var internal = Em.get(link, 'internal'),
        hasTitle = (!Em.isEmpty(Em.get(link, 'title')));
    if (hasTitle && !internal) {
      var domain = Em.get(link, 'domain');
      if (!Em.isEmpty(domain)) {
        // Collapse to the last two labels ("www.example.com" -> "example.com").
        // Guard against single-label hosts (e.g. "localhost"), where
        // s[s.length-2] would be undefined and render "undefined.localhost".
        var s = domain.split('.');
        if (s.length > 1) {
          domain = s[s.length-2] + "." + s[s.length-1];
        }
        // Escape before embedding in a SafeString — the domain comes from a
        // crawled URL and must not be trusted as HTML.
        domain = Handlebars.Utils.escapeExpression(domain);
        return new Handlebars.SafeString("<span class='domain'>(" + domain + ")</span>");
      }
    }
  }
});

View File

@ -65,6 +65,7 @@
{{#if title}}{{title}}{{else}}{{shortenUrl url}}{{/if}} {{#if title}}{{title}}{{else}}{{shortenUrl url}}{{/if}}
{{#unless internal}}<i class='fa fa-external-link'></i>{{/unless}} {{#unless internal}}<i class='fa fa-external-link'></i>{{/unless}}
</a> </a>
{{link-domain this}}
</li> </li>
{{/groupedEach}} {{/groupedEach}}
</ul> </ul>

View File

@ -286,6 +286,11 @@ a.star {
list-style: none; list-style: none;
} }
span.domain {
font-size: 10px;
color: $secondary_text_color;
}
.avatars { .avatars {
> div { > div {
float: left; float: left;

View File

@ -0,0 +1,94 @@
require 'open-uri'
require 'nokogiri'
require 'excon'

module Jobs

  # Background job that fetches the beginning of an external TopicLink's
  # HTML document and stores its <title> (and crawled_at) on the record.
  class CrawlTopicLink < Jobs::Base

    # Used purely for flow control: raised from the streaming block to make
    # Excon stop reading once we have enough of the document. Subclasses
    # Exception (not StandardError) deliberately so that no intermediate
    # bare `rescue` inside the HTTP stack can swallow it.
    class ReadEnough < Exception; end

    # Retrieve a response header regardless of case sensitivity.
    # Returns the header value, or nil when the header is absent.
    def self.header_for(head, name)
      header = head.headers.detect do |k, v|
        name == k.downcase
      end
      header[1] if header
    end

    # Request headers sent when crawling; `uri` supplies the Host header.
    def self.request_headers(uri)
      { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
        "Accept" => "text/html",
        "Host" => uri.host }
    end

    # Follow any redirects that might exist, up to `limit` hops.
    # Returns a URI only when the final response is a 200 with a text/html
    # content type; nil otherwise (bad url, too many redirects, non-HTML).
    def self.final_uri(url, limit=5)
      return if limit < 0

      uri = URI(url)
      return if uri.blank? || uri.host.blank?
      headers = CrawlTopicLink.request_headers(uri)
      head = Excon.head(url, read_timeout: 20, headers: headers)
      if head.status == 200
        uri = nil unless header_for(head, 'content-type') =~ /text\/html/
        return uri
      end

      location = header_for(head, 'location')
      if location
        # Path-only redirect: resolve it against the current scheme/host.
        location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
        return final_uri(location, limit - 1)
      end

      nil
    end

    # Fetch the beginning (just over 1KB) of the HTML document at `url`.
    # Returns "" when the url doesn't resolve to an HTML page.
    def self.fetch_beginning(url)
      uri = final_uri(url)
      return "" unless uri

      result = ""
      streamer = lambda do |chunk, remaining_bytes, total_bytes|
        result << chunk

        # Using exceptions for flow control is really bad, but there really seems to
        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
        # that matter!)
        raise ReadEnough.new if result.size > 1024
      end
      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri))
      result
    rescue ReadEnough
      result
    end

    # Crawl the link identified by args[:topic_link_id] and store its title.
    # Raises Discourse::InvalidParameters when the id is missing.
    def execute(args)
      raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present?

      # Only external links that haven't been crawled yet.
      topic_link = TopicLink.where(id: args[:topic_link_id], internal: false, crawled_at: nil).first
      return if topic_link.blank?

      crawled = false
      result = CrawlTopicLink.fetch_beginning(topic_link.url)
      doc = Nokogiri::HTML(result)
      if doc
        title = doc.at('title').try(:inner_text)
        if title.present?
          # Normalize whitespace before saving.
          title.gsub!(/\n/, ' ')
          title.gsub!(/ +/, ' ')
          title.strip!
          if title.present?
            crawled = topic_link.update_attributes(title: title[0..255], crawled_at: Time.now)
          end
        end
      end
    rescue StandardError
      # If there was a connection error, do nothing. Rescue StandardError —
      # not Exception — so signals, SystemExit and ReadEnough still propagate.
    ensure
      # Mark the link as crawled even on failure so we never retry it forever.
      topic_link.update_column(:crawled_at, Time.now) if !crawled && topic_link.present?
    end
  end
end

View File

@ -18,6 +18,8 @@ class TopicLink < ActiveRecord::Base
validate :link_to_self validate :link_to_self
after_commit :crawl_link_title
# Make sure a topic can't link to itself # Make sure a topic can't link to itself
def link_to_self def link_to_self
errors.add(:base, "can't link to the same topic") if (topic_id == link_topic_id) errors.add(:base, "can't link to the same topic") if (topic_id == link_topic_id)
@ -27,17 +29,18 @@ class TopicLink < ActiveRecord::Base
# Sam: complicated reports are really hard in AR # Sam: complicated reports are really hard in AR
builder = SqlBuilder.new("SELECT ftl.url, builder = SqlBuilder.new("SELECT ftl.url,
ft.title, COALESCE(ft.title, ftl.title) AS title,
ftl.link_topic_id, ftl.link_topic_id,
ftl.reflection, ftl.reflection,
ftl.internal, ftl.internal,
ftl.domain,
MIN(ftl.user_id) AS user_id, MIN(ftl.user_id) AS user_id,
SUM(clicks) AS clicks SUM(clicks) AS clicks
FROM topic_links AS ftl FROM topic_links AS ftl
LEFT JOIN topics AS ft ON ftl.link_topic_id = ft.id LEFT JOIN topics AS ft ON ftl.link_topic_id = ft.id
LEFT JOIN categories AS c ON c.id = ft.category_id LEFT JOIN categories AS c ON c.id = ft.category_id
/*where*/ /*where*/
GROUP BY ftl.url, ft.title, ftl.link_topic_id, ftl.reflection, ftl.internal GROUP BY ftl.url, ft.title, ftl.title, ftl.link_topic_id, ftl.reflection, ftl.internal, ftl.domain
ORDER BY clicks DESC") ORDER BY clicks DESC")
builder.where('ftl.topic_id = :topic_id', topic_id: topic_id) builder.where('ftl.topic_id = :topic_id', topic_id: topic_id)
@ -58,9 +61,10 @@ class TopicLink < ActiveRecord::Base
l.post_id, l.post_id,
l.url, l.url,
l.clicks, l.clicks,
t.title, COALESCE(t.title, l.title) AS title,
l.internal, l.internal,
l.reflection l.reflection,
l.domain
FROM topic_links l FROM topic_links l
LEFT JOIN topics t ON t.id = l.link_topic_id LEFT JOIN topics t ON t.id = l.link_topic_id
LEFT JOIN categories AS c ON c.id = t.category_id LEFT JOIN categories AS c ON c.id = t.category_id
@ -87,6 +91,7 @@ class TopicLink < ActiveRecord::Base
def self.extract_from(post) def self.extract_from(post)
return unless post.present? return unless post.present?
added_urls = []
TopicLink.transaction do TopicLink.transaction do
added_urls = [] added_urls = []
@ -184,6 +189,11 @@ class TopicLink < ActiveRecord::Base
end end
end end
end end
# Enqueue a background job to crawl this link's page for a title after the
# record is saved (wired up via the model's after_commit callback).
def crawl_link_title
Jobs.enqueue(:crawl_topic_link, topic_link_id: id)
end
end end
# == Schema Information # == Schema Information

View File

@ -6,7 +6,8 @@ class TopicLinkSerializer < ApplicationSerializer
:internal, :internal,
:reflection, :reflection,
:clicks, :clicks,
:user_id :user_id,
:domain
def url def url
object['url'] object['url']
@ -40,4 +41,8 @@ class TopicLinkSerializer < ApplicationSerializer
object['user_id'].present? object['user_id'].present?
end end
# Expose the link's domain; `object` is accessed hash-style here, matching
# the raw query-row access used by the other methods in this serializer.
def domain
object['domain']
end
end end

View File

@ -0,0 +1,6 @@
# Adds the columns needed for title crawling: the crawled title itself and a
# timestamp recording when the link was last crawled.
class AddTitleToTopicLinks < ActiveRecord::Migration
  def change
    change_table :topic_links do |t|
      t.string :title
      t.datetime :crawled_at
    end
  end
end

View File

@ -0,0 +1,12 @@
require 'spec_helper'
require_dependency 'jobs/base'
require_dependency 'jobs/regular/crawl_topic_link'

describe Jobs::CrawlTopicLink do
  # A fresh job instance per example.
  subject(:job) { described_class.new }

  it "needs a topic_link_id" do
    # Executing without a topic_link_id must raise InvalidParameters.
    lambda { job.execute({}) }.should raise_error(Discourse::InvalidParameters)
  end
end