FEATURE: use canonical links in posts.rss feed (#16190)
* FEATURE: use canonical links in posts.rss feed Previously we used non canonical links in posts.rss These links get crawled frequently by crawlers when discovering new content forcing crawlers to hop to non canonical pages just to end up visiting canonical pages This uses up expensive crawl time and adds load on Discourse sites Old links were of the form: `https://DOMAIN/t/SLUG/43/21` New links are of the form `https://DOMAIN/t/SLUG/43?page=2#post_21` This also adds a post_id identified element to crawler view that was missing. Note, to avoid very expensive N+1 queries required to figure out the page a post is on during rss generation, we cache that information. There is a smart "cache breaker" which ensures worst case scenario is a "page drift" - meaning we would publicize a post is on page 11 when it is actually on page 10 due to post deletions. Cache holds for up to 12 hours. Change only impacts public post RSS feeds (`/posts.rss`)
This commit is contained in:
parent
8664712c1a
commit
de9a031073
|
@ -84,6 +84,7 @@ class PostsController < ApplicationController
|
|||
.includes(:reply_to_user)
|
||||
.limit(50)
|
||||
rss_description = I18n.t("rss_description.posts")
|
||||
@use_canonical = true
|
||||
end
|
||||
|
||||
# Remove posts the user doesn't have permission to see
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
module PostsHelper
|
||||
include ApplicationHelper
|
||||
|
||||
CACHE_URL_DURATION = 12.hours.to_i
|
||||
|
||||
def self.clear_canonical_cache!(post)
|
||||
key = canonical_redis_key(post)
|
||||
Discourse.redis.del(key)
|
||||
end
|
||||
|
||||
def self.canonical_redis_key(post)
|
||||
"post_canonical_url_#{post.id}"
|
||||
end
|
||||
|
||||
def cached_post_url(post, use_canonical:)
|
||||
if use_canonical
|
||||
# this is very expensive to calculate page, we cache it for 12 hours
|
||||
key = PostsHelper.canonical_redis_key(post)
|
||||
|
||||
url = Discourse.redis.get(key)
|
||||
|
||||
# break cache if either slug or topic_id changes
|
||||
if url && !url.start_with?(post.topic.url)
|
||||
url = nil
|
||||
end
|
||||
|
||||
if !url
|
||||
url = post.canonical_url
|
||||
Discourse.redis.setex(key, CACHE_URL_DURATION, url)
|
||||
end
|
||||
|
||||
url
|
||||
else
|
||||
post.full_url
|
||||
end
|
||||
end
|
||||
end
|
|
@ -610,6 +610,18 @@ class Post < ActiveRecord::Base
|
|||
end
|
||||
end
|
||||
|
||||
def canonical_url
|
||||
topic_view = TopicView.new(topic, nil, post_number: post_number)
|
||||
|
||||
page = ""
|
||||
|
||||
if topic_view.page > 1
|
||||
page = "?page=#{topic_view.page}"
|
||||
end
|
||||
|
||||
"#{topic.url}#{page}#post_#{post_number}"
|
||||
end
|
||||
|
||||
def unsubscribe_url(user)
|
||||
"#{Discourse.base_url}/email/unsubscribe/#{UnsubscribeKey.create_key_for(user, self)}"
|
||||
end
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
<title><%= post.topic.title %></title>
|
||||
<dc:creator><![CDATA[<%= "@#{post.user.username}#{" #{post.user.name}" if (post.user.name.present? && SiteSetting.enable_names?)}" -%>]]></dc:creator>
|
||||
<description><![CDATA[ <%= PrettyText.format_for_email(post.cooked, post).html_safe %> ]]></description>
|
||||
<link><%= Discourse.base_url + post.url %></link>
|
||||
<link><%= cached_post_url(post, use_canonical: @use_canonical) %></link>
|
||||
<pubDate><%= post.created_at.rfc2822 %></pubDate>
|
||||
<guid isPermaLink="false"><%= Discourse.current_hostname %>-post-<%= post.id %></guid>
|
||||
</item>
|
||||
|
|
|
@ -37,7 +37,7 @@
|
|||
|
||||
<% @topic_view.posts.each_with_index do |post, idx| %>
|
||||
<% if (u = post.user) %>
|
||||
<div itemscope itemtype='http://schema.org/DiscussionForumPosting' class='topic-body crawler-post'>
|
||||
<div id='post_<%= post.post_number %>' itemscope itemtype='http://schema.org/DiscussionForumPosting' class='topic-body crawler-post'>
|
||||
<div class='crawler-post-meta'>
|
||||
<div itemprop='publisher' itemscope itemtype="http://schema.org/Organization">
|
||||
<meta itemprop='name' content='<%= SiteSetting.company_name.presence || SiteSetting.title %>'>
|
||||
|
|
|
@ -33,7 +33,8 @@ class TopicView
|
|||
:message_bus_last_id,
|
||||
:queued_posts_enabled,
|
||||
:personal_message,
|
||||
:can_review_topic
|
||||
:can_review_topic,
|
||||
:page
|
||||
)
|
||||
alias queued_posts_enabled? queued_posts_enabled
|
||||
|
||||
|
|
|
@ -1811,4 +1811,28 @@ describe Post do
|
|||
expect(post.cannot_permanently_delete_reason(Fabricate(:admin))).to eq(nil)
|
||||
end
|
||||
end
|
||||
|
||||
describe "#canonical_url" do
|
||||
it 'is able to determine correct canonical urls' do
|
||||
|
||||
# ugly, but no interface to set this and we don't want to create
|
||||
# 100 posts to test this thing
|
||||
TopicView.stubs(:chunk_size).returns(2)
|
||||
|
||||
post1 = Fabricate(:post)
|
||||
topic = post1.topic
|
||||
|
||||
post2 = Fabricate(:post, topic: topic)
|
||||
post3 = Fabricate(:post, topic: topic)
|
||||
post4 = Fabricate(:post, topic: topic)
|
||||
|
||||
topic_url = post1.topic.url
|
||||
|
||||
expect(post1.canonical_url).to eq("#{topic_url}#post_#{post1.post_number}")
|
||||
expect(post2.canonical_url).to eq("#{topic_url}#post_#{post2.post_number}")
|
||||
|
||||
expect(post3.canonical_url).to eq("#{topic_url}?page=2#post_#{post3.post_number}")
|
||||
expect(post4.canonical_url).to eq("#{topic_url}?page=2#post_#{post4.post_number}")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -2113,7 +2113,10 @@ describe PostsController do
|
|||
|
||||
body = response.body
|
||||
|
||||
expect(body).to include(public_post.url)
|
||||
# we cache in redis, in rare cases this can cause a flaky test
|
||||
PostsHelper.clear_canonical_cache!(public_post)
|
||||
|
||||
expect(body).to include(public_post.canonical_url)
|
||||
expect(body).to_not include(private_post.url)
|
||||
end
|
||||
|
||||
|
|
|
@ -4329,6 +4329,9 @@ RSpec.describe TopicsController do
|
|||
expect(body).to_not have_tag(:meta, with: { name: 'fragment' })
|
||||
expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=2")
|
||||
|
||||
expect(body).to include("id='post_1'")
|
||||
expect(body).to include("id='post_2'")
|
||||
|
||||
expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
|
||||
|
||||
get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent }
|
||||
|
@ -4336,6 +4339,9 @@ RSpec.describe TopicsController do
|
|||
|
||||
expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
|
||||
|
||||
expect(body).to include("id='post_3'")
|
||||
expect(body).to include("id='post_4'")
|
||||
|
||||
expect(body).to include('<link rel="prev" href="' + topic.relative_url)
|
||||
expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
|
||||
|
||||
|
|
Loading…
Reference in New Issue