2013-12-31 14:37:43 -05:00
|
|
|
require 'erb'
require 'open-uri'
require_dependency 'nokogiri'
|
|
|
|
|
|
|
|
class TopicEmbed < ActiveRecord::Base
|
|
|
|
# The topic this embed record is linked to.
belongs_to :topic
# The post this embed record is linked to.
belongs_to :post
# An embed record is invalid without the URL it was imported from.
validates :embed_url, presence: true
|
|
|
|
|
2014-03-19 16:33:21 -04:00
|
|
|
# Canonicalizes an embed URL so that trivially different spellings of the
# same address compare equal.
#
# The URL is stripped of surrounding whitespace, downcased, relieved of one
# trailing slash, and has its first run of consecutive hyphens collapsed
# into a single hyphen.
#
# url - the URL String to normalize.
#
# Returns a new String; the argument itself is not modified.
def self.normalize_url(url)
  # Strip first: otherwise trailing whitespace hides the trailing slash from
  # the pattern below and "http://a.com/ " would keep its slash.
  # \z anchors to the end of the string rather than the end of a line.
  # NOTE(review): only the first hyphen run is collapsed (sub, not gsub);
  # left unchanged so URLs that were normalized earlier still match.
  url.strip.downcase.sub(%r{/\z}, '').sub(/-+/, '-')
end
|
|
|
|
|
2014-04-02 15:54:21 -04:00
|
|
|
# Builds the HTML footer that credits the page an embed was imported from.
#
# url - the source URL; it is HTML-escaped before being placed in the link,
#       so quotes or angle brackets in it cannot break out of the attribute.
#
# Returns a String: a horizontal rule followed by the translated
# 'embed.imported_from' message wrapping a link to the URL.
def self.imported_from_html(url)
  safe_url = ERB::Util.html_escape(url)
  link = "<a href='#{safe_url}'>#{safe_url}</a>"
  "\n<hr>\n<small>#{I18n.t('embed.imported_from', link: link)}</small>\n"
end
|
|
|
|
|
2013-12-31 14:37:43 -05:00
|
|
|
# Import an article from a source (RSS/Atom/Other)
#
# The first time a URL is imported this creates a topic, its post and the
# TopicEmbed record inside one transaction. On later imports the existing
# post is revised only when the SHA1 of the content differs from the stored
# one.
#
# user     - passed to PostCreator / PostRevisor as the acting user.
# url      - the source URL; must begin with http:// or https://.
# title    - the title given to a newly created topic.
# contents - the article HTML; the caller's object is not modified.
#
# Returns nil when the URL is rejected; otherwise the existing post, or
# whatever PostCreator#create returned for a new embed.
def self.import(user, url, title, contents)
  # \A (not ^) so a newline cannot smuggle a non-http URL past the check.
  return unless url =~ %r{\Ahttps?://}

  contents = first_paragraph_from(contents) if SiteSetting.embed_truncate
  # Build a new string instead of appending in place: the caller's argument
  # stays untouched, and interpolation also copes with a non-String excerpt.
  contents = "#{contents}#{imported_from_html(url)}"

  url = normalize_url(url)
  embed = TopicEmbed.where("lower(embed_url) = ?", url).first
  content_sha1 = Digest::SHA1.hexdigest(contents)
  post = nil

  # If there is no embed, create a topic, post and the embed.
  if embed.blank?
    Topic.transaction do
      creator = PostCreator.new(user,
                                title: title,
                                raw: absolutize_urls(url, contents),
                                skip_validations: true,
                                cook_method: Post.cook_methods[:raw_html],
                                category: SiteSetting.embed_category)
      post = creator.create
      if post.present?
        TopicEmbed.create!(topic_id: post.topic_id,
                           embed_url: url,
                           content_sha1: content_sha1,
                           post_id: post.id)
      end
    end
  else
    post = embed.post
    # Update the topic if it changed
    if content_sha1 != embed.content_sha1
      revisor = PostRevisor.new(post)
      revisor.revise!(user, absolutize_urls(url, contents), skip_validations: true, bypass_rate_limiter: true)
      embed.update_column(:content_sha1, content_sha1)
    end
  end

  post
end
|
|
|
|
|
2014-04-01 18:16:56 -04:00
|
|
|
# Fetches a remote page and extracts its readable article content.
#
# url - the page URL; it is normalized with normalize_url before fetching.
#
# Returns a two-element Array [title, html]: the title reported by
# Readability, and the extracted content in which img/script/a URLs that
# have no host are given the scheme and host of the fetched page.
#
# Raises ArgumentError when the URL is not an http(s) URL.
def self.find_remote(url)
  # Loaded on first use rather than at file load time.
  require 'ruby-readability'

  url = normalize_url(url)
  original_uri = URI.parse(url)
  # Fetch through the parsed URI instead of Kernel#open, which treats a
  # string starting with "|" as a command and a bare path as a local file.
  unless original_uri.is_a?(URI::HTTP)
    raise ArgumentError, "only http(s) URLs can be fetched: #{url}"
  end
  page = original_uri.read

  readable = Readability::Document.new(page,
                                       tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
                                       attributes: %w[href src],
                                       remove_empty_nodes: false)

  url_attributes = { 'img' => 'src', 'script' => 'src', 'a' => 'href' }
  title = readable.title
  doc = Nokogiri::HTML(readable.content)
  doc.search(url_attributes.keys.join(',')).each do |node|
    attribute = url_attributes[node.name]
    src = node[attribute]
    # Elements lacking the attribute yield nil, not an empty string.
    next if src.nil? || src.empty?

    begin
      uri = URI.parse(src)
      next if uri.host

      uri.scheme = original_uri.scheme
      uri.host = original_uri.host
      node[attribute] = uri.to_s
    rescue URI::InvalidURIError
      # A URL that cannot be parsed or rewritten is left as it was; one bad
      # link should not abort the whole fetch.
    end
  end

  [title, doc.to_html]
end
|
2013-12-31 14:37:43 -05:00
|
|
|
|
2014-04-01 18:16:56 -04:00
|
|
|
# Fetches the page at url with find_remote and hands the result to
# TopicEmbed.import.
#
# opts - optional Hash; a truthy :title replaces the title found on the page.
#
# Returns whatever TopicEmbed.import returns.
def self.import_remote(user, url, opts=nil)
  overrides = opts || {}
  remote_title, remote_body = find_remote(url)
  chosen_title = overrides[:title] || remote_title
  TopicEmbed.import(user, url, chosen_title, remote_body)
end
|
|
|
|
|
|
|
|
# Convert any relative URLs to absolute. RSS is annoying for this.
#
# url      - the URL of the page the contents came from; its scheme, host
#            and non-default port form the prefix for root-relative links.
# contents - an HTML fragment.
#
# Returns the fragment's HTML as a String, with <a href> and <img src>
# values that start with "/" made absolute. Protocol-relative values
# ("//host/path") receive the page's scheme instead of being mistaken for a
# path on the page's host. Other values are left alone.
def self.absolutize_urls(url, contents)
  url = normalize_url(url)
  uri = URI(url)
  prefix = "#{uri.scheme}://#{uri.host}"
  # Compare against the scheme's own default port, so that e.g. http on 443
  # keeps its explicit port.
  prefix << ":#{uri.port}" if uri.port != uri.default_port

  # Wrapped in a div so the result can be read back as that div's inner HTML.
  fragment = Nokogiri::HTML.fragment("<div>#{contents}</div>")
  { 'a' => 'href', 'img' => 'src' }.each do |tag, attribute|
    fragment.css(tag).each do |node|
      value = node[attribute]
      next unless value.present? && value.start_with?('/')

      node[attribute] =
        if value =~ %r{\A//[^/]}
          "#{uri.scheme}:#{value}"
        else
          "#{prefix}/#{value.sub(%r{\A/+}, '')}"
        end
    end
  end

  fragment.at('div').inner_html
end
|
|
|
|
|
|
|
|
# Finds the topic recorded for an embed URL.
#
# embed_url - the URL to look up; it is normalized before the comparison
#             against the lowercased stored URL.
#
# Returns the topic id of the first matching row, or nil when none matches.
def self.topic_id_for_embed(embed_url)
  normalized = normalize_url(embed_url)
  matching_ids = TopicEmbed.where("lower(embed_url) = ?", normalized).pluck(:topic_id)
  matching_ids.first
end
|
|
|
|
|
2014-03-18 18:02:33 -04:00
|
|
|
# Builds a short excerpt from the leading paragraphs of an HTML document.
#
# <p> elements with non-blank text are appended in document order until the
# collected markup is at least 100 characters long. When no such paragraph
# exists, the first <div> is used instead.
#
# html - the HTML to take the excerpt from.
#
# Returns a String; empty when the document has neither a usable paragraph
# nor a div.
def self.first_paragraph_from(html)
  doc = Nokogiri::HTML(html)

  result = ""
  doc.css('p').each do |p|
    next unless p.text.present?

    result << p.to_s
    return result if result.size >= 100
  end
  return result unless result.blank?

  # If there is no first paragraph, return the first div (onebox).
  # to_s keeps the return type a String in every branch (and "" when there
  # is no div), so callers can append to and hash the result.
  doc.css('div').first.to_s
end
|
2014-04-03 11:30:43 -04:00
|
|
|
|
|
|
|
# Returns the remote article HTML for the topic a post belongs to, followed
# by the "imported from" footer.
#
# The result is cached under a per-topic key with a 10 minute expiry; the
# block below only runs when the cache does not supply a value.
def self.expanded_for(post)
  cache_key = "embed-topic:#{post.topic_id}"
  Rails.cache.fetch(cache_key, expires_in: 10.minutes) do
    embed_url = TopicEmbed.where(topic_id: post.topic_id).pluck(:embed_url).first
    _title, article_html = TopicEmbed.find_remote(embed_url)
    # String#<< returns the receiver, so this is also the block's value.
    article_html << TopicEmbed.imported_from_html(embed_url)
  end
end
|
|
|
|
|
2013-12-31 14:37:43 -05:00
|
|
|
end
|
2014-02-06 19:07:36 -05:00
|
|
|
|
|
|
|
# == Schema Information
|
|
|
|
#
|
|
|
|
# Table name: topic_embeds
|
|
|
|
#
|
|
|
|
# id :integer not null, primary key
|
|
|
|
# topic_id :integer not null
|
|
|
|
# post_id :integer not null
|
|
|
|
# embed_url :string(255) not null
|
|
|
|
# content_sha1 :string(40) not null
|
|
|
|
# created_at :datetime
|
|
|
|
# updated_at :datetime
|
|
|
|
#
|
|
|
|
# Indexes
|
|
|
|
#
|
|
|
|
# index_topic_embeds_on_embed_url (embed_url) UNIQUE
|
|
|
|
#
|