FEATURE: use canonical links in posts.rss feed (#16190)

* FEATURE: use canonical links in posts.rss feed

Previously we used non canonical links in posts.rss

These links get crawled frequently by crawlers when discovering new
content forcing crawlers to hop to non canonical pages just to end up
visiting canonical pages

This uses up expensive crawl time and adds load on Discourse sites

Old links were of the form:

`https://DOMAIN/t/SLUG/43/21`

New links are of the form

`https://DOMAIN/t/SLUG/43?page=2#post_21`

This also adds a post_id identified element to crawler view that was
missing.

Note, to avoid very expensive N+1 queries required to figure out the
page a post is on during rss generation, we cache that information.

There is a smart "cache breaker" which ensures worst case scenario is
a "page drift" - meaning we would publicize a post is on page 11 when
it is actually on page 10 due to post deletions. Cache holds for up to
12 hours.

Change only impacts public post RSS feeds (`/posts.rss`)
This commit is contained in:
Sam 2022-03-15 20:17:06 +11:00 committed by GitHub
parent 8664712c1a
commit de9a031073
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 90 additions and 4 deletions

View File

@ -84,6 +84,7 @@ class PostsController < ApplicationController
.includes(:reply_to_user)
.limit(50)
rss_description = I18n.t("rss_description.posts")
@use_canonical = true
end
# Remove posts the user doesn't have permission to see

View File

@ -0,0 +1,39 @@
# frozen_string_literal: true
module PostsHelper
include ApplicationHelper
CACHE_URL_DURATION = 12.hours.to_i
def self.clear_canonical_cache!(post)
key = canonical_redis_key(post)
Discourse.redis.del(key)
end
def self.canonical_redis_key(post)
"post_canonical_url_#{post.id}"
end
def cached_post_url(post, use_canonical:)
if use_canonical
# this is very expensive to calculate page, we cache it for 12 hours
key = PostsHelper.canonical_redis_key(post)
url = Discourse.redis.get(key)
# break cache if either slug or topic_id changes
if url && !url.start_with?(post.topic.url)
url = nil
end
if !url
url = post.canonical_url
Discourse.redis.setex(key, CACHE_URL_DURATION, url)
end
url
else
post.full_url
end
end
end

View File

@ -610,6 +610,18 @@ class Post < ActiveRecord::Base
end
end
def canonical_url
topic_view = TopicView.new(topic, nil, post_number: post_number)
page = ""
if topic_view.page > 1
page = "?page=#{topic_view.page}"
end
"#{topic.url}#{page}#post_#{post_number}"
end
def unsubscribe_url(user)
"#{Discourse.base_url}/email/unsubscribe/#{UnsubscribeKey.create_key_for(user, self)}"
end

View File

@ -12,7 +12,7 @@
<title><%= post.topic.title %></title>
<dc:creator><![CDATA[<%= "@#{post.user.username}#{" #{post.user.name}" if (post.user.name.present? && SiteSetting.enable_names?)}" -%>]]></dc:creator>
<description><![CDATA[ <%= PrettyText.format_for_email(post.cooked, post).html_safe %> ]]></description>
<link><%= Discourse.base_url + post.url %></link>
<link><%= cached_post_url(post, use_canonical: @use_canonical) %></link>
<pubDate><%= post.created_at.rfc2822 %></pubDate>
<guid isPermaLink="false"><%= Discourse.current_hostname %>-post-<%= post.id %></guid>
</item>

View File

@ -37,7 +37,7 @@
<% @topic_view.posts.each_with_index do |post, idx| %>
<% if (u = post.user) %>
<div itemscope itemtype='http://schema.org/DiscussionForumPosting' class='topic-body crawler-post'>
<div id='post_<%= post.post_number %>' itemscope itemtype='http://schema.org/DiscussionForumPosting' class='topic-body crawler-post'>
<div class='crawler-post-meta'>
<div itemprop='publisher' itemscope itemtype="http://schema.org/Organization">
<meta itemprop='name' content='<%= SiteSetting.company_name.presence || SiteSetting.title %>'>

View File

@ -33,7 +33,8 @@ class TopicView
:message_bus_last_id,
:queued_posts_enabled,
:personal_message,
:can_review_topic
:can_review_topic,
:page
)
alias queued_posts_enabled? queued_posts_enabled

View File

@ -1811,4 +1811,28 @@ describe Post do
expect(post.cannot_permanently_delete_reason(Fabricate(:admin))).to eq(nil)
end
end
describe "#canonical_url" do
it 'is able to determine correct canonical urls' do
# ugly, but no interface to set this and we don't want to create
# 100 posts to test this thing
TopicView.stubs(:chunk_size).returns(2)
post1 = Fabricate(:post)
topic = post1.topic
post2 = Fabricate(:post, topic: topic)
post3 = Fabricate(:post, topic: topic)
post4 = Fabricate(:post, topic: topic)
topic_url = post1.topic.url
expect(post1.canonical_url).to eq("#{topic_url}#post_#{post1.post_number}")
expect(post2.canonical_url).to eq("#{topic_url}#post_#{post2.post_number}")
expect(post3.canonical_url).to eq("#{topic_url}?page=2#post_#{post3.post_number}")
expect(post4.canonical_url).to eq("#{topic_url}?page=2#post_#{post4.post_number}")
end
end
end

View File

@ -2113,7 +2113,10 @@ describe PostsController do
body = response.body
expect(body).to include(public_post.url)
# we cache in redis, in rare cases this can cause a flaky test
PostsHelper.clear_canonical_cache!(public_post)
expect(body).to include(public_post.canonical_url)
expect(body).to_not include(private_post.url)
end

View File

@ -4329,6 +4329,9 @@ RSpec.describe TopicsController do
expect(body).to_not have_tag(:meta, with: { name: 'fragment' })
expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=2")
expect(body).to include("id='post_1'")
expect(body).to include("id='post_2'")
expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent }
@ -4336,6 +4339,9 @@ RSpec.describe TopicsController do
expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
expect(body).to include("id='post_3'")
expect(body).to include("id='post_4'")
expect(body).to include('<link rel="prev" href="' + topic.relative_url)
expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")