FEATURE: Replace SimpleRSS with Ruby RSS module (#5311)
* SPEC: PollFeedJob parsing atom feed
* add FeedItemAccessor to provide a consistent interface for accessing a feed item's tag content
* add FeedElementInstaller to install non-standard and non-namespaced feed elements
* FEATURE: replace SimpleRSS with Ruby RSS module
* get FinalDestination and download with Excon
* support namespaced element with FeedElementInstaller
This commit is contained in:
parent
410994b7f5
commit
5f318a5241
1
Gemfile
1
Gemfile
|
@ -166,7 +166,6 @@ gem 'gc_tracer', require: false, platform: :mri
|
|||
|
||||
# required for feed importing and embedding
|
||||
gem 'ruby-readability', require: false
|
||||
gem 'simple-rss', require: false
|
||||
|
||||
gem 'stackprof', require: false, platform: :mri
|
||||
gem 'memory_profiler', require: false, platform: :mri
|
||||
|
|
|
@ -368,7 +368,6 @@ GEM
|
|||
connection_pool (~> 2.2, >= 2.2.0)
|
||||
rack-protection (>= 1.5.0)
|
||||
redis (>= 3.3.4, < 5)
|
||||
simple-rss (1.3.1)
|
||||
slop (3.6.0)
|
||||
sprockets (3.7.1)
|
||||
concurrent-ruby (~> 1.0)
|
||||
|
@ -500,7 +499,6 @@ DEPENDENCIES
|
|||
seed-fu
|
||||
shoulda
|
||||
sidekiq
|
||||
simple-rss
|
||||
sprockets-rails
|
||||
stackprof
|
||||
thor
|
||||
|
|
|
@ -2,9 +2,13 @@
|
|||
# Creates and Updates Topics based on an RSS or ATOM feed.
|
||||
#
|
||||
require 'digest/sha1'
|
||||
require 'excon'
|
||||
require 'rss'
|
||||
require_dependency 'feed_item_accessor'
|
||||
require_dependency 'feed_element_installer'
|
||||
require_dependency 'final_destination'
|
||||
require_dependency 'post_creator'
|
||||
require_dependency 'post_revisor'
|
||||
require 'open-uri'
|
||||
|
||||
module Jobs
|
||||
class PollFeed < Jobs::Scheduled
|
||||
|
@ -46,17 +50,11 @@ module Jobs
|
|||
|
||||
def import_topic(topic)
|
||||
if topic.user
|
||||
TopicEmbed.import(topic.user, topic.url, topic.title, CGI.unescapeHTML(topic.content.scrub))
|
||||
TopicEmbed.import(topic.user, topic.url, topic.title, CGI.unescapeHTML(topic.content))
|
||||
end
|
||||
end
|
||||
|
||||
class Feed
|
||||
require 'simple-rss'
|
||||
|
||||
if SiteSetting.embed_username_key_from_feed.present?
|
||||
SimpleRSS.item_tags << SiteSetting.embed_username_key_from_feed.to_sym
|
||||
end
|
||||
|
||||
def initialize
|
||||
@feed_url = SiteSetting.feed_polling_url
|
||||
@feed_url = "http://#{@feed_url}" if @feed_url !~ /^https?\:\/\//
|
||||
|
@ -65,7 +63,7 @@ module Jobs
|
|||
def topics
|
||||
feed_topics = []
|
||||
|
||||
rss = fetch_rss
|
||||
rss = parsed_feed
|
||||
return feed_topics unless rss.present?
|
||||
|
||||
rss.items.each do |i|
|
||||
|
@ -78,36 +76,56 @@ module Jobs
|
|||
|
||||
private
|
||||
|
||||
def fetch_rss
|
||||
SimpleRSS.parse open(@feed_url, allow_redirections: :all)
|
||||
rescue OpenURI::HTTPError, SimpleRSSError
|
||||
def parsed_feed
|
||||
raw_feed = fetch_rss
|
||||
return nil if raw_feed.blank?
|
||||
|
||||
if SiteSetting.embed_username_key_from_feed.present?
|
||||
FeedElementInstaller.install(SiteSetting.embed_username_key_from_feed, raw_feed)
|
||||
end
|
||||
|
||||
RSS::Parser.parse(raw_feed)
|
||||
rescue RSS::NotWellFormedError, RSS::InvalidRSSError
|
||||
nil
|
||||
end
|
||||
|
||||
def fetch_rss
|
||||
final_destination = FinalDestination.new(@feed_url, verbose: true)
|
||||
feed_final_url = final_destination.resolve
|
||||
return nil unless final_destination.status == :resolved
|
||||
|
||||
Excon.new(feed_final_url.to_s).request(method: :get, expects: 200).body
|
||||
rescue Excon::Error::HTTPStatus
|
||||
nil
|
||||
end
|
||||
end
|
||||
|
||||
class FeedTopic
|
||||
def initialize(article_rss_item)
|
||||
@article_rss_item = article_rss_item
|
||||
@accessor = FeedItemAccessor.new(article_rss_item)
|
||||
end
|
||||
|
||||
def url
|
||||
link = @article_rss_item.link
|
||||
link = @accessor.link
|
||||
if url?(link)
|
||||
return link
|
||||
else
|
||||
return @article_rss_item.id
|
||||
return @accessor.element_content(:id)
|
||||
end
|
||||
end
|
||||
|
||||
def content
|
||||
@article_rss_item.content_encoded&.force_encoding("UTF-8")&.scrub ||
|
||||
@article_rss_item.content&.force_encoding("UTF-8")&.scrub ||
|
||||
@article_rss_item.description&.force_encoding("UTF-8")&.scrub
|
||||
content = nil
|
||||
|
||||
%i[content_encoded content description].each do |content_element_name|
|
||||
content ||= @accessor.element_content(content_element_name)
|
||||
end
|
||||
|
||||
content&.force_encoding('UTF-8')&.scrub
|
||||
end
|
||||
|
||||
def title
|
||||
@article_rss_item.title.force_encoding("UTF-8").scrub
|
||||
@accessor.element_content(:title).force_encoding('UTF-8').scrub
|
||||
end
|
||||
|
||||
def user
|
||||
|
@ -125,11 +143,7 @@ module Jobs
|
|||
end
|
||||
|
||||
def author_username
|
||||
begin
|
||||
@article_rss_item.send(SiteSetting.embed_username_key_from_feed.to_sym)
|
||||
rescue
|
||||
nil
|
||||
end
|
||||
@accessor.element_content(SiteSetting.embed_username_key_from_feed.sub(':', '_'))
|
||||
end
|
||||
|
||||
def default_user
|
||||
|
@ -145,9 +159,6 @@ module Jobs
|
|||
def find_user(user_name)
|
||||
User.where(username_lower: user_name).first
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
require 'rexml/document'
|
||||
require 'rss'
|
||||
|
||||
# Teaches Ruby's RSS parser about one extra, namespaced item element
# (e.g. "discourse:username") so that parsed RSS items and Atom entries expose
# it through an accessor named "<namespace>_<element>" (e.g. #discourse_username).
#
# Instances are only created through FeedElementInstaller.install.
class FeedElementInstaller
  private_class_method :new

  # A usable element name is exactly "<prefix>:<local name>", both parts non-empty.
  QUALIFIED_NAME = /\A[^:\s]+:[^:\s]+\z/

  # Installs parsing support for +element_name+ using the namespace URI that
  # +feed+ declares for the element's prefix.
  #
  # element_name - qualified element name such as "discourse:username".
  # feed         - raw feed XML (String).
  #
  # Does nothing (and returns nil) when the name is not namespace-qualified,
  # when the feed cannot be parsed as XML, or when the feed does not declare
  # the prefix on its root element.
  def self.install(element_name, feed)
    # RSS Specification at http://cyber.harvard.edu/rss/rss.html#extendingRss
    # > A RSS feed may contain [non-standard elements], only if those elements are *defined in a namespace*
    qualified_name = element_name.to_s
    return nil unless qualified_name =~ QUALIFIED_NAME

    new(qualified_name, feed).install
  end

  attr_reader :feed, :original_name, :element_namespace, :element_name, :element_accessor

  def initialize(element_name, feed)
    @feed = feed
    @original_name = element_name
    @element_namespace, @element_name = element_name.split(':')
    @element_accessor = "#{@element_namespace}_#{@element_name}"
  end

  # Namespace URI declared for the element's prefix on the feed's root element.
  # Returns '' when the prefix is not declared there or the feed is not
  # parseable XML.
  def element_uri
    @element_uri ||= declared_namespace_uri
  end

  # Registers the element with the RSS and Atom item classes unless an accessor
  # of that name already exists on them.
  def install
    # The "already installed" checks below only look at the accessor name, so
    # registering under an empty URI would never be corrected by a later,
    # well-formed feed. Skip instead and let a later call install it properly.
    return nil if element_uri.empty?

    install_in_rss unless installed_in_rss?
    install_in_atom unless installed_in_atom?
  end

  private

  # Looks the prefix up among the root element's namespace declarations.
  # REXML raises on malformed XML; that is reported as "no URI" so the caller's
  # own feed parsing decides how to treat the broken feed.
  def declared_namespace_uri
    REXML::Document.new(feed).root&.attributes&.namespaces&.fetch(@element_namespace, '') || ''
  rescue REXML::ParseException
    ''
  end

  def install_in_rss
    RSS::Rss::Channel::Item.install_text_element(element_name, element_uri, '?', element_accessor, nil, original_name)
    RSS::BaseListener.install_get_text_element(element_uri, element_name, element_accessor)
  end

  def install_in_atom
    RSS::Atom::Entry.install_text_element(element_name, element_uri, '?', element_accessor, nil, original_name)
    RSS::Atom::Feed::Entry.install_text_element(element_name, element_uri, '?', element_accessor, nil, original_name)
    RSS::BaseListener.install_get_text_element(element_uri, element_name, element_accessor)
  end

  def installed_in_rss?
    RSS::Rss::Channel::Item.method_defined?(element_accessor)
  end

  def installed_in_atom?
    RSS::Atom::Entry.method_defined?(element_accessor) || RSS::Atom::Feed::Entry.method_defined?(element_accessor)
  end
end
|
|
@ -0,0 +1,25 @@
|
|||
# Gives callers one way to read values from a parsed feed item, whether the
# item returns a plain value for an element or an object that wraps the value
# (exposing it through #content, or #href for links).
class FeedItemAccessor
  attr_accessor :rss_item

  # rss_item - a parsed feed item/entry object.
  def initialize(rss_item)
    @rss_item = rss_item
  end

  # Returns the content of the named element.
  #
  # element_name - accessor name on the item (String or Symbol), e.g. :title.
  #
  # Returns nil when the name is nil or the item has no such public accessor.
  # When the element responds to #content that value is returned; otherwise
  # the element itself is returned.
  def element_content(element_name)
    try_attribute_or_self(element(element_name), :content)
  end

  # Returns the item's link: the element's #href when it has one, otherwise
  # the element itself (nil when the item has no link accessor).
  def link
    try_attribute_or_self(element(:link), :href)
  end

  private

  # Reads the named element from the item, or nil if it is unavailable.
  def element(element_name)
    # respond_to?(nil) raises TypeError, so treat a missing name as "no element".
    return nil if element_name.nil?
    return nil unless rss_item.respond_to?(element_name)

    # The name can come from configuration; public_send keeps the dispatch
    # limited to the item's public interface.
    rss_item.public_send(element_name)
  end

  # Unwraps +element+ through +attribute_name+ when it supports that reader.
  def try_attribute_or_self(element, attribute_name)
    element.respond_to?(attribute_name) ? element.public_send(attribute_name) : element
  end
end
|
|
@ -0,0 +1,40 @@
|
|||
require 'feed_element_installer'
|
||||
require 'rails_helper'
|
||||
|
||||
describe FeedElementInstaller do
  # RSS and Atom fixtures are expected to behave the same way, so both groups
  # run the same examples against a different fixture file.
  shared_examples 'feed element installation' do |fixture_file|
    let(:raw_feed) { file_from_fixtures(fixture_file, 'feed').read }

    # Installs the element, then parses the fixture and returns its first item.
    def first_item_after_installing(element_name)
      FeedElementInstaller.install(element_name, raw_feed)
      RSS::Parser.parse(raw_feed).items.first
    end

    it 'creates parsing for a non-standard, namespaced element' do
      item = first_item_after_installing('discourse:username')

      expect(item.discourse_username).to eq('xrav3nz')
    end

    it 'does not create parsing for a non-standard, non-namespaced element' do
      item = first_item_after_installing('username')

      expect { item.username }.to raise_error(NoMethodError)
    end
  end

  describe '#install_rss_element' do
    include_examples 'feed element installation', 'feed.rss'
  end

  describe '#install_atom_element' do
    include_examples 'feed element installation', 'feed.atom'
  end
end
|
|
@ -0,0 +1,33 @@
|
|||
require 'rss'
|
||||
require 'feed_item_accessor'
|
||||
require 'rails_helper'
|
||||
|
||||
describe FeedItemAccessor do
  # Parses a feed fixture (second parser argument false, as in each context's
  # setup) and returns its first item.
  def first_item_of(fixture_file)
    RSS::Parser.parse(file_from_fixtures(fixture_file, 'feed'), false).items.first
  end

  context 'for ATOM feed' do
    let(:entry) { first_item_of('feed.atom') }
    let(:item_accessor) { FeedItemAccessor.new(entry) }

    describe '#element_content' do
      # Atom elements wrap their text, so the accessor must unwrap #content
      it { expect(item_accessor.element_content('title')).to eq(entry.title.content) }
    end

    describe '#link' do
      # Atom links carry the URL in #href
      it { expect(item_accessor.link).to eq(entry.link.href) }
    end
  end

  context 'for RSS feed' do
    let(:item) { first_item_of('feed.rss') }
    let(:item_accessor) { FeedItemAccessor.new(item) }

    describe '#element_content' do
      # RSS elements are returned as-is
      it { expect(item_accessor.element_content('title')).to eq(item.title) }
    end

    describe '#link' do
      it { expect(item_accessor.link).to eq(item.link) }
    end
  end
end
|
|
@ -0,0 +1,30 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<feed
|
||||
xmlns="http://www.w3.org/2005/Atom"
|
||||
xmlns:thr="http://purl.org/syndication/thread/1.0"
|
||||
xmlns:discourse="http://discourse.org/rss/modules/discourse/"
|
||||
xml:lang="en-US"
|
||||
xml:base="https://blog.discourse.org/wp-atom.php"
|
||||
>
|
||||
<title type="text">Discourse</title>
|
||||
<subtitle type="text">Official blog for the open source Discourse project</subtitle>
|
||||
<updated>2017-10-23T23:45:37Z</updated>
|
||||
<link rel="alternate" type="text/html" href="https://blog.discourse.org" />
|
||||
<id>https://blog.discourse.org/feed/atom/</id>
|
||||
<link rel="self" type="application/atom+xml" href="https://blog.discourse.org/feed/atom/" />
|
||||
<generator uri="https://wordpress.org/" version="4.8.2">WordPress</generator>
|
||||
<entry>
|
||||
<discourse:username><![CDATA[xrav3nz]]></discourse:username>
|
||||
<author>
|
||||
<name>xrav3nz</name>
|
||||
</author>
|
||||
<title type="html"><![CDATA[Poll Feed Spec Fixture]]></title>
|
||||
<link rel="alternate" type="text/html" href="https://blog.discourse.org/2017/09/poll-feed-spec-fixture/" />
|
||||
<id>https://blog.discourse.org/?p=pollfeedspec</id>
|
||||
<updated>2017-09-14T15:22:33Z</updated>
|
||||
<published>2017-09-14T15:22:33Z</published>
|
||||
<category scheme="https://blog.discourse.org" term="design" />
|
||||
<summary type="html"><![CDATA[Here are some random descriptions... […]]]></summary>
|
||||
<content type="html" xml:base="https://blog.discourse.org/2017/09/poll-feed-spec-fixture/"><![CDATA[<p>This is the body & content. </p>]]></content>
|
||||
</entry>
|
||||
</feed>
|
|
@ -5,6 +5,7 @@
|
|||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
|
||||
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
|
||||
xmlns:discourse="http://discourse.org/rss/modules/discourse/"
|
||||
>
|
||||
<channel>
|
||||
<title>Discourse</title>
|
||||
|
@ -21,6 +22,7 @@
|
|||
<link>https://blog.discourse.org/2017/09/poll-feed-spec-fixture/</link>
|
||||
<pubDate>Thu, 14 Sep 2017 15:22:33 +0000</pubDate>
|
||||
<dc:creator><![CDATA[xrav3nz]]></dc:creator>
|
||||
<discourse:username><![CDATA[xrav3nz]]></discourse:username>
|
||||
<category><![CDATA[spec]]></category>
|
||||
<guid isPermaLink="false">https://blog.discourse.org/?p=pollfeedspec</guid>
|
||||
<description><![CDATA[Here are some random descriptions... […]]]></description>
|
||||
|
|
|
@ -40,76 +40,91 @@ describe Jobs::PollFeed do
|
|||
poller.execute({})
|
||||
poller.execute({})
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
describe '#poll_feed' do
|
||||
let(:embed_by_username) { 'eviltrout' }
|
||||
let(:embed_username_key_from_feed) { 'dc_creator' }
|
||||
let(:embed_username_key_from_feed) { 'discourse:username' }
|
||||
let!(:default_user) { Fabricate(:evil_trout) }
|
||||
let!(:feed_author) { Fabricate(:user, username: 'xrav3nz', email: 'hi@bye.com') }
|
||||
|
||||
before do
|
||||
SiteSetting.feed_polling_enabled = true
|
||||
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/'
|
||||
SiteSetting.embed_by_username = embed_by_username
|
||||
shared_examples 'topic creation based on the the feed' do
|
||||
describe 'author username parsing' do
|
||||
context 'when neither embed_by_username nor embed_username_key_from_feed is set' do
|
||||
before do
|
||||
SiteSetting.embed_by_username = ""
|
||||
SiteSetting.embed_username_key_from_feed = ""
|
||||
end
|
||||
|
||||
stub_request(:get, SiteSetting.feed_polling_url).to_return(
|
||||
status: 200,
|
||||
body: file_from_fixtures('feed.rss', 'feed').read,
|
||||
headers: { "Content-Type" => "application/rss+xml" }
|
||||
)
|
||||
end
|
||||
|
||||
describe 'author username parsing' do
|
||||
context 'when neither embed_by_username nor embed_username_key_from_feed is set' do
|
||||
before do
|
||||
SiteSetting.embed_by_username = ""
|
||||
SiteSetting.embed_username_key_from_feed = ""
|
||||
it 'does not import topics' do
|
||||
expect { poller.poll_feed }.not_to change { Topic.count }
|
||||
end
|
||||
end
|
||||
|
||||
it 'does not import topics' do
|
||||
expect { poller.poll_feed }.not_to change { Topic.count }
|
||||
context 'when embed_by_username is set' do
|
||||
before do
|
||||
SiteSetting.embed_by_username = embed_by_username
|
||||
SiteSetting.embed_username_key_from_feed = ""
|
||||
end
|
||||
|
||||
it 'creates the new topics under embed_by_username' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.user).to eq(default_user)
|
||||
end
|
||||
end
|
||||
|
||||
context 'when embed_username_key_from_feed is set' do
|
||||
before do
|
||||
SiteSetting.embed_username_key_from_feed = embed_username_key_from_feed
|
||||
end
|
||||
|
||||
it 'creates the new topics under the username found' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.user).to eq(feed_author)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
context 'when embed_by_username is set' do
|
||||
before do
|
||||
SiteSetting.embed_by_username = embed_by_username
|
||||
SiteSetting.embed_username_key_from_feed = ""
|
||||
end
|
||||
|
||||
it 'creates the new topics under embed_by_username' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.user).to eq(default_user)
|
||||
end
|
||||
end
|
||||
|
||||
context 'when embed_username_key_from_feed is set' do
|
||||
before do
|
||||
SiteSetting.embed_username_key_from_feed = embed_username_key_from_feed
|
||||
end
|
||||
|
||||
it 'creates the new topics under the username found' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.user).to eq(feed_author)
|
||||
end
|
||||
it 'parses creates a new post correctly' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.title).to eq('Poll Feed Spec Fixture')
|
||||
expect(Topic.last.first_post.raw).to include('<p>This is the body & content. </p>')
|
||||
expect(Topic.last.topic_embed.embed_url).to eq('https://blog.discourse.org/2017/09/poll-feed-spec-fixture')
|
||||
end
|
||||
end
|
||||
|
||||
it 'parses the title correctly' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.title).to eq('Poll Feed Spec Fixture')
|
||||
context 'when parsing RSS feed' do
|
||||
before do
|
||||
SiteSetting.feed_polling_enabled = true
|
||||
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/'
|
||||
SiteSetting.embed_by_username = embed_by_username
|
||||
|
||||
stub_request(:head, SiteSetting.feed_polling_url).to_return(status: 200)
|
||||
stub_request(:get, SiteSetting.feed_polling_url).to_return(
|
||||
status: 200,
|
||||
body: file_from_fixtures('feed.rss', 'feed').read,
|
||||
headers: { "Content-Type" => "application/rss+xml" }
|
||||
)
|
||||
end
|
||||
|
||||
include_examples 'topic creation based on the the feed'
|
||||
end
|
||||
|
||||
it 'parses the content correctly' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.first_post.raw).to include('<p>This is the body & content. </p>')
|
||||
end
|
||||
context 'when parsing ATOM feed' do
|
||||
before do
|
||||
SiteSetting.feed_polling_enabled = true
|
||||
SiteSetting.feed_polling_url = 'https://blog.discourse.org/feed/atom/'
|
||||
SiteSetting.embed_by_username = embed_by_username
|
||||
|
||||
it 'parses the link correctly' do
|
||||
expect { poller.poll_feed }.to change { Topic.count }.by(1)
|
||||
expect(Topic.last.topic_embed.embed_url).to eq('https://blog.discourse.org/2017/09/poll-feed-spec-fixture')
|
||||
stub_request(:head, SiteSetting.feed_polling_url).to_return(status: 200)
|
||||
stub_request(:get, SiteSetting.feed_polling_url).to_return(
|
||||
status: 200,
|
||||
body: file_from_fixtures('feed.atom', 'feed').read,
|
||||
headers: { "Content-Type" => "application/atom+xml" }
|
||||
)
|
||||
end
|
||||
|
||||
include_examples 'topic creation based on the the feed'
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue