FEATURE: Let sites add a sitemap.xml file. (#16357)

* FEATURE: Let sites add a sitemap.xml file.

This PR adds the same features discourse-sitemap provides to core. Sitemaps are only added to the robots.txt file if the `enable_sitemap` setting is enabled and `login_required` disabled.

After merging discourse/discourse-sitemap#34, this change will take priority over the sitemap plugin because it will disable itself. We're also using the same sitemaps table, so our migration won't try to create it
again using `if_not_exists: true`.
This commit is contained in:
Roman Rizzi 2022-04-12 10:33:59 -03:00 committed by GitHub
parent 9c33f6de05
commit 6f76a12e0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 576 additions and 0 deletions

View File

@ -0,0 +1,71 @@
# frozen_string_literal: true
class SitemapController < ApplicationController
layout false
skip_before_action :preload_json, :check_xhr
before_action :check_sitemap_enabled
def index
@sitemaps = Sitemap
.where(enabled: true)
.where.not(name: Sitemap::NEWS_SITEMAP_NAME)
render :index
end
def page
index = params.require(:page)
sitemap = Sitemap.find_by(enabled: true, name: index.to_s)
raise Discourse::NotFound if sitemap.nil?
@output = Rails.cache.fetch("sitemap/#{sitemap.name}/#{sitemap.max_page_size}", expires_in: 24.hours) do
@topics = sitemap.topics
render :page, content_type: 'text/xml; charset=UTF-8'
end
render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed?
end
def recent
sitemap = Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME)
@output = Rails.cache.fetch("sitemap/recent/#{sitemap.last_posted_at.to_i}", expires_in: 1.hour) do
@topics = sitemap.topics
render :page, content_type: 'text/xml; charset=UTF-8'
end
render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed?
end
def news
sitemap = Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME)
@output = Rails.cache.fetch("sitemap/news", expires_in: 5.minutes) do
dlocale = SiteSetting.default_locale.downcase
@locale = dlocale.gsub(/_.*/, '')
@locale = dlocale.sub('_', '-') if @locale === "zh"
@topics = sitemap.topics
render :news, content_type: 'text/xml; charset=UTF-8'
end
render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed?
end
private
def check_sitemap_enabled
raise Discourse::NotFound if !SiteSetting.enable_sitemap
end
def build_sitemap_topic_url(slug, id, posts_count = nil)
base_url = [Discourse.base_url, 't', slug, id].join('/')
return base_url if posts_count.nil?
page, mod = posts_count.divmod(TopicView.chunk_size)
page += 1 if mod > 0
page > 1 ? "#{base_url}?page=#{page}" : base_url
end
helper_method :build_sitemap_topic_url
end

View File

@ -0,0 +1,11 @@
# frozen_string_literal: true
module Jobs
class RegenerateSitemaps < ::Jobs::Scheduled
every 1.hour
def execute(_args)
Sitemap.regenerate_sitemaps if SiteSetting.enable_sitemap?
end
end
end

87
app/models/sitemap.rb Normal file
View File

@ -0,0 +1,87 @@
# frozen_string_literal: true
class Sitemap < ActiveRecord::Base
RECENT_SITEMAP_NAME = 'recent'
NEWS_SITEMAP_NAME = 'news'
class << self
def regenerate_sitemaps
names_used = [RECENT_SITEMAP_NAME, NEWS_SITEMAP_NAME]
names_used.each { |name| touch(name) }
count = Category.where(read_restricted: false).sum(:topic_count)
max_page_size = SiteSetting.sitemap_page_size
size, mod = count.divmod(max_page_size)
size += 1 if mod > 0
size.times do |index|
page_name = (index + 1).to_s
touch(page_name)
names_used << page_name
end
where.not(name: names_used).update_all(enabled: false)
end
def touch(name)
find_or_initialize_by(name: name).tap do |sitemap|
sitemap.update!(
last_posted_at: sitemap.last_posted_topic || 3.days.ago,
enabled: true
)
end
end
end
def topics
if name == RECENT_SITEMAP_NAME
sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at, :posts_count)
elsif name == NEWS_SITEMAP_NAME
sitemap_topics.pluck(:id, :title, :slug, :created_at)
else
sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at)
end
end
def last_posted_topic
sitemap_topics.maximum(:updated_at)
end
def max_page_size
SiteSetting.sitemap_page_size
end
private
def sitemap_topics
indexable_topics = Topic
.where(visible: true)
.joins(:category)
.where(categories: { read_restricted: false })
if name == RECENT_SITEMAP_NAME
indexable_topics.where('bumped_at > ?', 3.days.ago).order(bumped_at: :desc)
elsif name == NEWS_SITEMAP_NAME
indexable_topics.where('bumped_at > ?', 72.hours.ago).order(bumped_at: :desc)
else
offset = (name.to_i - 1) * max_page_size
indexable_topics.limit(max_page_size).offset(offset)
end
end
end
# == Schema Information
#
# Table name: sitemaps
#
# id :bigint not null, primary key
# name :string not null
# last_posted_at :datetime not null
# enabled :boolean default(TRUE), not null
#
# Indexes
#
# index_sitemaps_on_name (name) UNIQUE
#

View File

@ -12,4 +12,8 @@ Disallow: <%= path %>
<% end %>
<%- if SiteSetting.enable_sitemap? && !SiteSetting.login_required? %>
Sitemap: <%= request.protocol %><%= request.host_with_port %>/sitemap.xml
<% end %>
<%= server_plugin_outlet "robots_txt_index" %>

View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<% @sitemaps.each do |sitemap| %>
<sitemap>
<loc><%= Discourse.base_url %>/sitemap_<%= sitemap.name %>.xml</loc>
<lastmod><%= sitemap.last_posted_at.xmlschema %></lastmod>
</sitemap>
<% end %>
</sitemapindex>

View File

@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"
xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
http://www.google.com/schemas/sitemap-news/0.9
http://www.google.com/schemas/sitemap-news/0.9/sitemap-news.xsd
http://www.google.com/schemas/sitemap-image/1.1
http://www.google.com/schemas/sitemap-image/1.1/sitemap-image.xsd">
<% @topics.each do |(id, title, slug, created_at)| %>
<url>
<loc><%= build_sitemap_topic_url(slug, id) %></loc>
<news:news>
<news:publication>
<news:name><%= SiteSetting.title %></news:name>
<news:language><%= @locale %></news:language>
</news:publication>
<news:publication_date><%= created_at.xmlschema %></news:publication_date>
<news:title><%= title %></news:title>
</news:news>
</url>
<% end %>
</urlset>

View File

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:image="http://www.google.com/schemas/sitemap-image/1.1"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
http://www.google.com/schemas/sitemap-image/1.1
http://www.google.com/schemas/sitemap-image/1.1/sitemap-image.xsd">
<% @topics.each do |(id, slug, bumped_at, updated_at, posts_count)| %>
<url>
<loc><%= build_sitemap_topic_url(slug, id, posts_count) %></loc>
<lastmod><%= (bumped_at || updated_at).xmlschema %></lastmod>
</url>
<% end %>
</urlset>

View File

@ -2335,6 +2335,9 @@ en:
base_font: "Base font to use for most text on the site. Themes can override via the `--font-family` CSS custom property."
heading_font: "Font to use for headings on the site. Themes can override via the `--heading-font-family` CSS custom property."
enable_sitemap: "Generate a sitemap for your site and include it in the robots.txt file."
sitemap_page_size: "Number of URLs to include in each sitemap page. Max 50.000"
short_title: "The short title will be used on the user's home screen, launcher, or other places where space may be limited. It should be limited to 12 characters."
dashboard_hidden_reports: "Allow to hide the specified reports from the dashboard."

View File

@ -26,6 +26,13 @@ Discourse::Application.routes.draw do
post "webhooks/sendgrid" => "webhooks#sendgrid"
post "webhooks/sparkpost" => "webhooks#sparkpost"
scope path: nil, constraints: { format: :xml } do
resources :sitemap, only: [:index]
get "/sitemap_:page" => "sitemap#page", page: /[1-9][0-9]*/
get "/sitemap_recent" => "sitemap#recent"
get "/news" => "sitemap#news"
end
scope path: nil, constraints: { format: /.*/ } do
if Rails.env.development?
mount Sidekiq::Web => "/sidekiq"

View File

@ -360,6 +360,10 @@ basic:
default: "arial"
enum: "BaseFontSetting"
refresh: true
enable_sitemap:
default: true
sitemap_page_size:
default: 10000
login:
invite_only:

View File

@ -0,0 +1,13 @@
# frozen_string_literal: true
class CreateSitemapsTable < ActiveRecord::Migration[6.1]
def change
create_table :sitemaps, if_not_exists: true do |t|
t.string :name, null: false
t.datetime :last_posted_at, null: false
t.boolean :enabled, null: false, default: true
end
add_index :sitemaps, :name, unique: true, if_not_exists: true
end
end

161
spec/models/sitemap_spec.rb Normal file
View File

@ -0,0 +1,161 @@
# frozen_string_literal: true
require 'rails_helper'
describe Sitemap do
describe '.regenerate_sitemaps' do
fab!(:topic) { Fabricate(:topic) }
it 'always create the news and recent sitemaps' do
described_class.regenerate_sitemaps
sitemaps = Sitemap.where(name: [Sitemap::NEWS_SITEMAP_NAME, Sitemap::RECENT_SITEMAP_NAME], enabled: true)
expect(sitemaps.exists?).to eq(true)
end
it 'bumps existing sitemaps last_posted_at attribute' do
news = Sitemap.create!(name: Sitemap::NEWS_SITEMAP_NAME, enabled: true, last_posted_at: 10.days.ago)
described_class.regenerate_sitemaps
expect(news.reload.last_posted_at).to eq_time(topic.updated_at)
end
it 'creates the sitemap first page when there is a topic' do
described_class.regenerate_sitemaps
first_page = Sitemap.find_by(name: '1')
expect(first_page.enabled).to eq(true)
end
it "only counts topics from unrestricted categories" do
restricted_category = Fabricate(:category, read_restricted: true)
topic.update!(category: restricted_category)
Category.update_stats
described_class.regenerate_sitemaps
first_page = Sitemap.find_by(name: '1')
expect(first_page).to be_nil
end
it 'disable empty pages' do
unused_page = Sitemap.touch('10')
described_class.regenerate_sitemaps
expect(unused_page.reload.enabled).to eq(false)
end
end
describe '#topics' do
shared_examples 'Excludes hidden and restricted topics' do
it "doesn't include topics from restricted categories" do
restricted_category = Fabricate(:category, read_restricted: true)
Fabricate(:topic, category: restricted_category)
topics_data = sitemap.topics
expect(topics_data).to be_empty
end
it "doesn't include hidden topics" do
Fabricate(:topic, visible: false)
topics_data = sitemap.topics
expect(topics_data).to be_empty
end
end
context 'when the sitemap corresponds to a page' do
let(:sitemap) { described_class.new(enabled: true, last_posted_at: 1.minute.ago, name: '1') }
it 'returns an empty array if there no topics' do
expect(sitemap.topics).to be_empty
end
it 'returns all the neccessary topic attributes to render a sitemap URL' do
topic = Fabricate(:topic)
topics_data = sitemap.topics.first
expect(topics_data[0]).to eq(topic.id)
expect(topics_data[1]).to eq(topic.slug)
expect(topics_data[2]).to eq_time(topic.bumped_at)
expect(topics_data[3]).to eq_time(topic.updated_at)
end
it 'returns empty when the page is empty because the previous page is not full' do
Fabricate(:topic)
sitemap.name = '2'
topics_data = sitemap.topics
expect(topics_data).to be_empty
end
it "order topics by bumped_at asc" do
topic_1 = Fabricate(:topic, bumped_at: 3.minute.ago)
topic_2 = Fabricate(:topic, bumped_at: 1.minutes.ago)
topic_3 = Fabricate(:topic, bumped_at: 20.minutes.ago)
topic_ids = sitemap.topics.map { |td| td[0] }
expect(topic_ids).to contain_exactly(topic_2.id, topic_1.id, topic_3.id)
end
it_behaves_like 'Excludes hidden and restricted topics'
end
context 'sitemap for recent topics' do
let(:sitemap) do
described_class.new(
enabled: true, last_posted_at: 1.minute.ago, name: described_class::RECENT_SITEMAP_NAME
)
end
it 'return topics that were bumped less than three days ago' do
Fabricate(:topic, bumped_at: 4.days.ago)
recent_topic = Fabricate(:topic, bumped_at: 2.days.ago, posts_count: 3)
topics_data = sitemap.topics
expect(topics_data.length).to eq(1)
recent_topic_data = topics_data.first
expect(recent_topic_data[0]).to eq(recent_topic.id)
expect(recent_topic_data[1]).to eq(recent_topic.slug)
expect(recent_topic_data[2]).to eq_time(recent_topic.bumped_at)
expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at)
expect(recent_topic_data[4]).to eq(recent_topic.posts_count)
end
it_behaves_like 'Excludes hidden and restricted topics'
end
context 'news sitemap' do
let(:sitemap) do
described_class.new(
enabled: true, last_posted_at: 1.minute.ago, name: described_class::NEWS_SITEMAP_NAME
)
end
it 'returns topics that were bumped in the last 72 hours' do
Fabricate(:topic, bumped_at: 73.hours.ago)
recent_topic = Fabricate(:topic, bumped_at: 71.hours.ago)
topics_data = sitemap.topics
expect(topics_data.length).to eq(1)
recent_topic_data = topics_data.first
expect(recent_topic_data[0]).to eq(recent_topic.id)
expect(recent_topic_data[1]).to eq(recent_topic.title)
expect(recent_topic_data[2]).to eq(recent_topic.slug)
expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at)
end
it_behaves_like 'Excludes hidden and restricted topics'
end
end
end

View File

@ -130,5 +130,36 @@ RSpec.describe RobotsTxtController do
expect(response.status).to eq(200)
expect(response.body).to eq(SiteSetting.overridden_robots_txt)
end
describe 'sitemap' do
let(:sitemap_line) { "Sitemap: #{Discourse.base_protocol}://#{Discourse.current_hostname}/sitemap.xml" }
it 'include sitemap location when enabled' do
SiteSetting.enable_sitemap = true
SiteSetting.login_required = false
get '/robots.txt'
expect(response.body).to include(sitemap_line)
end
it "doesn't include sitemap location when disabled" do
SiteSetting.enable_sitemap = false
SiteSetting.login_required = false
get '/robots.txt'
expect(response.body).not_to include(sitemap_line)
end
it "doesn't include sitemap location when site has login_required enabled" do
SiteSetting.enable_sitemap = true
SiteSetting.login_required = true
get '/robots.txt'
expect(response.body).not_to include(sitemap_line)
end
end
end
end

View File

@ -0,0 +1,135 @@
# frozen_string_literal: true
require 'rails_helper'
describe SitemapController do
describe 'before_action :check_sitemap_enabled' do
it 'returns a 404 if sitemap is disabled' do
Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME)
SiteSetting.enable_sitemap = false
get '/sitemap.xml'
expect(response.status).to eq(404)
end
end
describe '#index' do
it "lists no sitemaps if we haven't generated them yet" do
get '/sitemap.xml'
sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc')
expect(sitemaps).to be_empty
end
it 'lists generated sitemaps' do
Sitemap.create!(name: 'recent', enabled: true, last_posted_at: 1.minute.ago)
get '/sitemap.xml'
sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc')
expect(sitemaps.length).to eq(1)
expect(sitemaps.first.text).to include('recent')
end
it "doesn't list disabled sitemaps" do
Sitemap.create!(name: 'recent', enabled: false, last_posted_at: 1.minute.ago)
get '/sitemap.xml'
sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc')
expect(sitemaps).to be_empty
end
end
describe '#page' do
before { Discourse.cache.delete("sitemap/#{1}/#{SiteSetting.sitemap_page_size}") }
it "returns a 404 if the sitemap doesn't exist" do
get '/sitemap_999.xml'
expect(response.status).to eq(404)
end
it 'includes the topics for that page' do
topic = Fabricate(:topic)
Sitemap.create!(name: '1', enabled: true, last_posted_at: 1.minute.ago)
get '/sitemap_1.xml'
url = Nokogiri::XML::Document.parse(response.body).css('url').last
loc = url.at_css('loc').text
last_mod = url.at_css('lastmod').text
expect(response.status).to eq(200)
expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}")
expect(last_mod).to eq(topic.bumped_at.xmlschema)
end
end
describe '#recent' do
let(:sitemap) { Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) }
before { Discourse.cache.delete("sitemap/recent/#{sitemap.last_posted_at.to_i}") }
it 'returns a sitemap with topics bumped in the last three days' do
topic = Fabricate(:topic, bumped_at: 1.minute.ago)
old_topic = Fabricate(:topic, bumped_at: 6.days.ago)
get '/sitemap_recent.xml'
urls = Nokogiri::XML::Document.parse(response.body).css('url')
loc = urls.first.at_css('loc').text
last_mod = urls.first.at_css('lastmod').text
expect(response.status).to eq(200)
expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}")
expect(last_mod).to eq(topic.bumped_at.xmlschema)
all_urls = urls.map { |u| u.at_css('loc').text }
expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}")
end
it 'generates correct page numbers based on the topic post count' do
topic = Fabricate(:topic, bumped_at: 1.minute.ago)
page_size = TopicView.chunk_size
incomplete_page_size = TopicView.chunk_size - 1
topic.update!(posts_count: incomplete_page_size, updated_at: 4.hour.ago)
get '/sitemap_recent.xml'
url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text
expect(url).not_to include('?page=2')
topic.update!(posts_count: page_size, updated_at: 3.hour.ago)
get '/sitemap_recent.xml'
url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text
expect(url).not_to include('?page=2')
two_page_size = page_size + 1
topic.update!(posts_count: two_page_size, updated_at: 2.hour.ago)
get '/sitemap_recent.xml'
url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text
expect(url).to include('?page=2')
end
end
describe '#news' do
let!(:sitemap) { Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME) }
before { Discourse.cache.delete("sitemap/news") }
it 'returns a sitemap with topics bumped in the last 72 hours' do
topic = Fabricate(:topic, bumped_at: 71.hours.ago)
old_topic = Fabricate(:topic, bumped_at: 73.hours.ago)
get '/news.xml'
urls = Nokogiri::XML::Document.parse(response.body).css('url')
loc = urls.first.at_css('loc').text
expect(response.status).to eq(200)
expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}")
all_urls = urls.map { |u| u.at_css('loc').text }
expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}")
end
end
end