FEATURE: Support author meta tags for embedding

This commit is contained in:
Robin Ward 2016-08-30 12:01:04 -04:00
parent 3d8e75c6ca
commit 7da44e3bf0
2 changed files with 68 additions and 26 deletions

View File

@ -6,6 +6,10 @@ class TopicEmbed < ActiveRecord::Base
validates_presence_of :embed_url
validates_uniqueness_of :embed_url
# Plain mutable holder with three read/write attributes: title, body and author.
class FetchResponse
  # Title value carried by the response.
  attr_accessor :title
  # Body value carried by the response.
  attr_accessor :body
  # Author value carried by the response (nil until assigned).
  attr_accessor :author
end
# Normalizes a URL string in four steps, in this order:
#   1. lower-case it,
#   2. remove the first slash found at the end of a line,
#   3. collapse the first run of dashes into a single dash,
#   4. strip leading/trailing whitespace.
# Only the first match is replaced in steps 2 and 3, and whitespace is
# stripped last, so a slash followed by trailing whitespace is kept.
def self.normalize_url(url)
  lowered = url.downcase
  without_slash = lowered.sub(%r{/$}, '')
  dashes_collapsed = without_slash.sub(/-+/, '-')
  dashes_collapsed.strip
end
@ -76,18 +80,28 @@ class TopicEmbed < ActiveRecord::Base
opts[:blacklist] = SiteSetting.embed_blacklist_selector if SiteSetting.embed_blacklist_selector.present?
embed_classname_whitelist = SiteSetting.embed_classname_whitelist if SiteSetting.embed_classname_whitelist.present?
doc = Readability::Document.new(open(url).read, opts)
response = FetchResponse.new
html = open(url).read
tags = {'img' => 'src', 'script' => 'src', 'a' => 'href'}
title = doc.title || ''
raw_doc = Nokogiri::HTML(html)
auth_element = raw_doc.at('meta[@name="author"]')
if auth_element.present?
response.author = User.where(username_lower: auth_element[:content].strip).first
end
read_doc = Readability::Document.new(html, opts)
title = raw_doc.title || ''
title.strip!
if SiteSetting.embed_title_scrubber.present?
title.sub!(Regexp.new(SiteSetting.embed_title_scrubber), '')
title.strip!
end
response.title = title
doc = Nokogiri::HTML(read_doc.content)
doc = Nokogiri::HTML(doc.content)
tags = {'img' => 'src', 'script' => 'src', 'a' => 'href'}
doc.search(tags.keys.join(',')).each do |node|
url_param = tags[node.name]
src = node[url_param]
@ -115,13 +129,17 @@ class TopicEmbed < ActiveRecord::Base
end
end
[title, doc.to_html]
response.body = doc.to_html
response
end
# Fetches the page at `url` with find_remote and hands the result to
# TopicEmbed.import.
#
# Using the FetchResponse-based lines below:
# - opts[:title], when present, replaces the fetched title;
# - response.author, when present, replaces the passed-in user as the
#   importing user.
#
# NOTE(review): this span shows two `def` lines and two bodies (a
# `title, body = ...` form and a `response = ...` form). It looks like the
# pre-change and post-change lines of the commit side by side — confirm
# against the real file which lines are live.
def self.import_remote(user, url, opts=nil)
def self.import_remote(import_user, url, opts=nil)
opts = opts || {}
title, body = find_remote(url)
TopicEmbed.import(user, url, opts[:title] || title, body)
response = find_remote(url)
# An explicit title passed by the caller takes precedence over the fetched one.
response.title = opts[:title] if opts[:title].present?
# Prefer the author found on the response over the caller-supplied user.
import_user = response.author if response.author.present?
TopicEmbed.import(import_user, url, response.title, response.body)
end
# Convert any relative URLs to absolute. RSS is annoying for this.
@ -171,7 +189,9 @@ class TopicEmbed < ActiveRecord::Base
def self.expanded_for(post)
Rails.cache.fetch("embed-topic:#{post.topic_id}", expires_in: 10.minutes) do
url = TopicEmbed.where(topic_id: post.topic_id).pluck(:embed_url).first
_title, body = TopicEmbed.find_remote(url)
response = TopicEmbed.find_remote(url)
body = response.body
body << TopicEmbed.imported_from_html(url)
body
end

View File

@ -75,51 +75,73 @@ describe TopicEmbed do
end
# Expects the fetched title to keep its " - Classic Books" suffix when this
# example does not set embed_title_scrubber.
# NOTE(review): both a tuple-style (`title, _ =`) and a response-style
# assertion appear here; presumably old and new diff lines — confirm.
it "doesn't scrub the title by default" do
title, _ = TopicEmbed.find_remote(url)
expect(title).to eq("Through the Looking Glass - Classic Books")
response = TopicEmbed.find_remote(url)
expect(response.title).to eq("Through the Looking Glass - Classic Books")
end
# Sets embed_title_scrubber to a pattern matching the " - Classic Books"
# suffix and expects the fetched title to come back without it.
# NOTE(review): both a tuple-style (`title, _ =`) and a response-style
# assertion appear here; presumably old and new diff lines — confirm.
it "scrubs the title when the option is enabled" do
SiteSetting.embed_title_scrubber = " - Classic Books$"
title, _ = TopicEmbed.find_remote(url)
expect(title).to eq("Through the Looking Glass")
response = TopicEmbed.find_remote(url)
expect(response.title).to eq("Through the Looking Glass")
end
end
# Runs find_remote against stubbed HTML with embed_classname_whitelist
# stubbed to 'emoji , foo', then checks that the whitelisted classes stay on
# the img/p nodes, that the non-whitelisted class 'other' is removed, and
# that no author is set.
# NOTE(review): `content`-based and `response`-based lines both appear in
# this span; presumably old and new diff lines shown together — confirm
# against the real spec file.
context 'post with allowed classes "foo" and "emoji"' do
let(:user) { Fabricate(:user) }
let(:url) { 'http://eviltrout.com/123' }
let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
let!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
content = ''
# Local captured by the blocks below; assigned in the before hook.
response = nil
before(:each) do
SiteSetting.stubs(:embed_classname_whitelist).returns 'emoji , foo'
# Make TopicEmbed.open return the stubbed file so `contents` is what gets read.
file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file
_, content = TopicEmbed.find_remote(url)
response = TopicEmbed.find_remote(url)
end
# `contents` has no author meta tag, so no author is expected.
it "has no author tag" do
expect(response.author).to be_blank
end
it 'img node has emoji class' do
expect(content).to have_tag('img', with: { class: 'emoji' })
expect(response.body).to have_tag('img', with: { class: 'emoji' })
end
it 'img node has foo class' do
expect(content).to have_tag('img', with: { class: 'foo' })
expect(response.body).to have_tag('img', with: { class: 'foo' })
end
it 'p node has foo class' do
expect(content).to have_tag('p', with: { class: 'foo' })
expect(response.body).to have_tag('p', with: { class: 'foo' })
end
it 'nodes removes classes other than emoji' do
expect(content).to have_tag('img', without: { class: 'other' })
expect(response.body).to have_tag('img', without: { class: 'other' })
end
end
# Checks that find_remote sets the response author to the user whose
# username matches the page's <meta name="author"> content.
context 'post with author metadata' do
  # Created eagerly (let!) so the user exists before find_remote runs.
  let!(:user) { Fabricate(:user, username: 'eviltrout') }
  let(:url) { 'http://eviltrout.com/321' }
  let(:contents) { '<html><head><meta name="author" content="eviltrout"></head><body>rich and morty</body></html>' }
  let!(:embeddable_host) { Fabricate(:embeddable_host) }
  let!(:file) { StringIO.new }

  # Local captured by the blocks below; assigned in the before hook.
  response = nil

  before(:each) do
    # Make TopicEmbed.open return the stubbed file so `contents` is what gets read.
    file.stubs(:read).returns contents
    TopicEmbed.stubs(:open).returns file
    response = TopicEmbed.find_remote(url)
  end

  # Previously described as "has no author tag", which contradicted the assertion.
  it "finds the author from the meta tag" do
    expect(response.author).to eq(user)
  end
end
context 'post with no allowed classes' do
@ -130,29 +152,29 @@ describe TopicEmbed do
let!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
content = ''
response = nil
before(:each) do
SiteSetting.stubs(:embed_classname_whitelist).returns ' '
file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file
_, content = TopicEmbed.find_remote(url)
response = TopicEmbed.find_remote(url)
end
it 'img node doesn\'t have emoji class' do
expect(content).to have_tag('img', without: { class: 'emoji' })
expect(response.body).to have_tag('img', without: { class: 'emoji' })
end
it 'img node doesn\'t have foo class' do
expect(content).to have_tag('img', without: { class: 'foo' })
expect(response.body).to have_tag('img', without: { class: 'foo' })
end
it 'p node doesn\'t foo class' do
expect(content).to have_tag('p', without: { class: 'foo' })
expect(response.body).to have_tag('p', without: { class: 'foo' })
end
it 'img node doesn\'t have other class' do
expect(content).to have_tag('img', without: { class: 'other' })
expect(response.body).to have_tag('img', without: { class: 'other' })
end
end