Support for crawling topic links
parent bb0baa6d7c
commit 7e3ea5d644
@@ -385,3 +385,25 @@ Handlebars.registerHelper('customHTML', function(name, contextString, options) {
 Ember.Handlebars.registerBoundHelper('humanSize', function(size) {
   return new Handlebars.SafeString(I18n.toHumanSize(size));
 });
+
+/**
+  Renders the domain for a link if it's not internal and has a title.
+
+  @method link-domain
+  @for Handlebars
+**/
+Handlebars.registerHelper('link-domain', function(property, options) {
+  var link = Em.get(this, property, options);
+  if (link) {
+    var internal = Em.get(link, 'internal'),
+        hasTitle = (!Em.isEmpty(Em.get(link, 'title')));
+    if (hasTitle && !internal) {
+      var domain = Em.get(link, 'domain');
+      if (!Em.isEmpty(domain)) {
+        var s = domain.split('.');
+        domain = s[s.length-2] + "." + s[s.length-1];
+        return new Handlebars.SafeString("<span class='domain'>(" + domain + ")</span>");
+      }
+    }
+  }
+});
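
The helper above keeps only the last two labels of the link's host name before wrapping it in a span.domain element. Purely as an illustration of that shortening rule (not part of the commit, and the real helper is the JavaScript above), the same logic in plain Ruby:

# Keep only the last two labels of the host name, as the JS helper does.
def shorten_domain(domain)
  labels = domain.split('.')
  "#{labels[-2]}.#{labels[-1]}"
end

shorten_domain("meta.discourse.org")  # => "discourse.org"
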
@@ -65,6 +65,7 @@
       {{#if title}}{{title}}{{else}}{{shortenUrl url}}{{/if}}
       {{#unless internal}}<i class='fa fa-external-link'></i>{{/unless}}
     </a>
+    {{link-domain this}}
   </li>
 {{/groupedEach}}
 </ul>
@@ -286,6 +286,11 @@ a.star {
   list-style: none;
 }
 
+span.domain {
+  font-size: 10px;
+  color: $secondary_text_color;
+}
+
 .avatars {
   > div {
     float: left;
@@ -0,0 +1,94 @@
+require 'open-uri'
+require 'nokogiri'
+require 'excon'
+
+module Jobs
+  class CrawlTopicLink < Jobs::Base
+
+    class ReadEnough < Exception; end
+
+    # Retrieve a header regardless of case sensitivity
+    def self.header_for(head, name)
+      header = head.headers.detect do |k, v|
+        name == k.downcase
+      end
+      header[1] if header
+    end
+
+    def self.request_headers(uri)
+      { "User-Agent" => "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
+        "Accept" => "text/html",
+        "Host" => uri.host }
+    end
+
+    # Follow any redirects that might exist
+    def self.final_uri(url, limit=5)
+      return if limit < 0
+
+      uri = URI(url)
+      return if uri.blank? || uri.host.blank?
+      headers = CrawlTopicLink.request_headers(uri)
+      head = Excon.head(url, read_timeout: 20, headers: headers)
+      if head.status == 200
+        uri = nil unless header_for(head, 'content-type') =~ /text\/html/
+        return uri
+      end
+
+      location = header_for(head, 'location')
+      if location
+        location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
+        return final_uri(location, limit - 1)
+      end
+
+      nil
+    end
+
+    # Fetch the beginning of a HTML document at a url
+    def self.fetch_beginning(url)
+      uri = final_uri(url)
+      return "" unless uri
+
+      result = ""
+      streamer = lambda do |chunk, remaining_bytes, total_bytes|
+        result << chunk
+
+        # Using exceptions for flow control is really bad, but there really seems to
+        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
+        # that matter!)
+        raise ReadEnough.new if result.size > 1024
+      end
+      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri))
+      result
+
+    rescue ReadEnough
+      result
+    end
+
+    def execute(args)
+      raise Discourse::InvalidParameters.new(:topic_link_id) unless args[:topic_link_id].present?
+      topic_link = TopicLink.where(id: args[:topic_link_id], internal: false, crawled_at: nil).first
+      return if topic_link.blank?
+
+      crawled = false
+
+      result = CrawlTopicLink.fetch_beginning(topic_link.url)
+      doc = Nokogiri::HTML(result)
+      if doc
+        title = doc.at('title').try(:inner_text)
+        if title.present?
+          title.gsub!(/\n/, ' ')
+          title.gsub!(/ +/, ' ')
+          title.strip!
+          if title.present?
+            crawled = topic_link.update_attributes(title: title[0..255], crawled_at: Time.now)
+          end
+        end
+      end
+    rescue Exception
+      # If there was a connection error, do nothing
+    ensure
+      topic_link.update_column(:crawled_at, Time.now) if !crawled && topic_link.present?
+    end
+
+  end
+end
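
For reference, the cleanup applied to the crawled <title> in execute collapses newlines and runs of spaces, strips the result, and truncates it before saving. A standalone sketch of that normalization (plain Ruby, no Discourse dependencies; the sample string is made up):

# Mirrors the title cleanup in Jobs::CrawlTopicLink#execute above.
title = "  Support for\n  crawling   topic links  "
title.gsub!(/\n/, ' ')   # newlines become spaces
title.gsub!(/ +/, ' ')   # collapse runs of spaces
title.strip!
title = title[0..255]    # stored titles are capped at 256 characters
# => "Support for crawling topic links"
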
@@ -18,6 +18,8 @@ class TopicLink < ActiveRecord::Base
 
   validate :link_to_self
 
+  after_commit :crawl_link_title
+
   # Make sure a topic can't link to itself
   def link_to_self
     errors.add(:base, "can't link to the same topic") if (topic_id == link_topic_id)
@@ -27,17 +29,18 @@ class TopicLink < ActiveRecord::Base
 
     # Sam: complicated reports are really hard in AR
     builder = SqlBuilder.new("SELECT ftl.url,
-                                     ft.title,
+                                     COALESCE(ft.title, ftl.title) AS title,
                                      ftl.link_topic_id,
                                      ftl.reflection,
                                      ftl.internal,
+                                     ftl.domain,
                                      MIN(ftl.user_id) AS user_id,
                                      SUM(clicks) AS clicks
                              FROM topic_links AS ftl
                              LEFT JOIN topics AS ft ON ftl.link_topic_id = ft.id
                              LEFT JOIN categories AS c ON c.id = ft.category_id
                              /*where*/
-                             GROUP BY ftl.url, ft.title, ftl.link_topic_id, ftl.reflection, ftl.internal
+                             GROUP BY ftl.url, ft.title, ftl.title, ftl.link_topic_id, ftl.reflection, ftl.internal, ftl.domain
                              ORDER BY clicks DESC")
 
     builder.where('ftl.topic_id = :topic_id', topic_id: topic_id)
@@ -58,9 +61,10 @@ class TopicLink < ActiveRecord::Base
                               l.post_id,
                               l.url,
                               l.clicks,
-                              t.title,
+                              COALESCE(t.title, l.title) AS title,
                               l.internal,
-                              l.reflection
+                              l.reflection,
+                              l.domain
                        FROM topic_links l
                        LEFT JOIN topics t ON t.id = l.link_topic_id
                        LEFT JOIN categories AS c ON c.id = t.category_id
@@ -87,6 +91,7 @@ class TopicLink < ActiveRecord::Base
   def self.extract_from(post)
     return unless post.present?
 
+    added_urls = []
     TopicLink.transaction do
 
       added_urls = []
@@ -184,6 +189,11 @@ class TopicLink < ActiveRecord::Base
       end
     end
   end
+
+  # Crawl a link's title after it's saved
+  def crawl_link_title
+    Jobs.enqueue(:crawl_topic_link, topic_link_id: id)
+  end
 end
 
 # == Schema Information
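
Taken together, the model and job changes mean every committed external TopicLink schedules a background crawl that fills in its title. A hedged sketch of the flow in a Rails console, assuming topic_link is a persisted, non-internal, not-yet-crawled TopicLink (the Jobs.enqueue and execute calls are the ones shown in this diff):

# 1. Saving the link fires the after_commit callback, which enqueues:
#      Jobs.enqueue(:crawl_topic_link, topic_link_id: id)
#
# 2. A worker later runs the job, fetching roughly the first 1 KB of the page,
#    extracting <title>, and storing it along with crawled_at:
Jobs::CrawlTopicLink.new.execute(topic_link_id: topic_link.id)
#
# Even when the fetch fails, the ensure block stamps crawled_at so the
# link is not retried on every pass.
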
@@ -6,7 +6,8 @@ class TopicLinkSerializer < ApplicationSerializer
              :internal,
              :reflection,
              :clicks,
-             :user_id
+             :user_id,
+             :domain
 
   def url
     object['url']
@@ -40,4 +41,8 @@ class TopicLinkSerializer < ApplicationSerializer
     object['user_id'].present?
   end
 
+  def domain
+    object['domain']
+  end
+
 end
@@ -0,0 +1,6 @@
+class AddTitleToTopicLinks < ActiveRecord::Migration
+  def change
+    add_column :topic_links, :title, :string
+    add_column :topic_links, :crawled_at, :datetime
+  end
+end
@@ -0,0 +1,12 @@
+require 'spec_helper'
+require_dependency 'jobs/base'
+require_dependency 'jobs/regular/crawl_topic_link'
+
+describe Jobs::CrawlTopicLink do
+
+  let(:job) { Jobs::CrawlTopicLink.new }
+
+  it "needs a topic_link_id" do
+    -> { job.execute({}) }.should raise_error(Discourse::InvalidParameters)
+  end
+end