From 6f76a12e0a5532b40f691e1ff51623b89c2bbb59 Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Tue, 12 Apr 2022 10:33:59 -0300 Subject: [PATCH] FEATURE: Let sites add a sitemap.xml file. (#16357) * FEATURE: Let sites add a sitemap.xml file. This PR adds the same features discourse-sitemap provides to core. Sitemaps are only added to the robots.txt file if the `enable_sitemap` setting is enabled and `login_required` disabled. After merging discourse/discourse-sitemap#34, this change will take priority over the sitemap plugin because it will disable itself. We're also using the same sitemaps table, so our migration won't try to create it again using `if_not_exists: true`. --- app/controllers/sitemap_controller.rb | 71 ++++++++ app/jobs/scheduled/regenerate_sitemaps.rb | 11 ++ app/models/sitemap.rb | 87 ++++++++++ app/views/robots_txt/index.erb | 4 + app/views/sitemap/index.xml.erb | 9 + app/views/sitemap/news.xml.erb | 25 +++ app/views/sitemap/page.xml.erb | 15 ++ config/locales/server.en.yml | 3 + config/routes.rb | 7 + config/site_settings.yml | 4 + .../20220331204447_create_sitemaps_table.rb | 13 ++ spec/models/sitemap_spec.rb | 161 ++++++++++++++++++ spec/requests/robots_txt_controller_spec.rb | 31 ++++ spec/requests/sitemap_controller_spec.rb | 135 +++++++++++++++ 14 files changed, 576 insertions(+) create mode 100644 app/controllers/sitemap_controller.rb create mode 100644 app/jobs/scheduled/regenerate_sitemaps.rb create mode 100644 app/models/sitemap.rb create mode 100644 app/views/sitemap/index.xml.erb create mode 100644 app/views/sitemap/news.xml.erb create mode 100644 app/views/sitemap/page.xml.erb create mode 100644 db/migrate/20220331204447_create_sitemaps_table.rb create mode 100644 spec/models/sitemap_spec.rb create mode 100644 spec/requests/sitemap_controller_spec.rb diff --git a/app/controllers/sitemap_controller.rb b/app/controllers/sitemap_controller.rb new file mode 100644 index 00000000000..3ae7efcde04 --- /dev/null +++ b/app/controllers/sitemap_controller.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +class SitemapController < ApplicationController + layout false + skip_before_action :preload_json, :check_xhr + before_action :check_sitemap_enabled + + def index + @sitemaps = Sitemap + .where(enabled: true) + .where.not(name: Sitemap::NEWS_SITEMAP_NAME) + + render :index + end + + def page + index = params.require(:page) + sitemap = Sitemap.find_by(enabled: true, name: index.to_s) + raise Discourse::NotFound if sitemap.nil? + + @output = Rails.cache.fetch("sitemap/#{sitemap.name}/#{sitemap.max_page_size}", expires_in: 24.hours) do + @topics = sitemap.topics + render :page, content_type: 'text/xml; charset=UTF-8' + end + + render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed? + end + + def recent + sitemap = Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) + + @output = Rails.cache.fetch("sitemap/recent/#{sitemap.last_posted_at.to_i}", expires_in: 1.hour) do + @topics = sitemap.topics + render :page, content_type: 'text/xml; charset=UTF-8' + end + + render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed? + end + + def news + sitemap = Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME) + + @output = Rails.cache.fetch("sitemap/news", expires_in: 5.minutes) do + dlocale = SiteSetting.default_locale.downcase + @locale = dlocale.gsub(/_.*/, '') + @locale = dlocale.sub('_', '-') if @locale === "zh" + @topics = sitemap.topics + render :news, content_type: 'text/xml; charset=UTF-8' + end + + render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed? + end + + private + + def check_sitemap_enabled + raise Discourse::NotFound if !SiteSetting.enable_sitemap + end + + def build_sitemap_topic_url(slug, id, posts_count = nil) + base_url = [Discourse.base_url, 't', slug, id].join('/') + return base_url if posts_count.nil? + + page, mod = posts_count.divmod(TopicView.chunk_size) + page += 1 if mod > 0 + + page > 1 ? "#{base_url}?page=#{page}" : base_url + end + helper_method :build_sitemap_topic_url + +end diff --git a/app/jobs/scheduled/regenerate_sitemaps.rb b/app/jobs/scheduled/regenerate_sitemaps.rb new file mode 100644 index 00000000000..36b9ac9b675 --- /dev/null +++ b/app/jobs/scheduled/regenerate_sitemaps.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +module Jobs + class RegenerateSitemaps < ::Jobs::Scheduled + every 1.hour + + def execute(_args) + Sitemap.regenerate_sitemaps if SiteSetting.enable_sitemap? + end + end +end diff --git a/app/models/sitemap.rb b/app/models/sitemap.rb new file mode 100644 index 00000000000..a6ead4692ea --- /dev/null +++ b/app/models/sitemap.rb @@ -0,0 +1,87 @@ +# frozen_string_literal: true + +class Sitemap < ActiveRecord::Base + RECENT_SITEMAP_NAME = 'recent' + NEWS_SITEMAP_NAME = 'news' + + class << self + def regenerate_sitemaps + names_used = [RECENT_SITEMAP_NAME, NEWS_SITEMAP_NAME] + + names_used.each { |name| touch(name) } + + count = Category.where(read_restricted: false).sum(:topic_count) + max_page_size = SiteSetting.sitemap_page_size + size, mod = count.divmod(max_page_size) + size += 1 if mod > 0 + + size.times do |index| + page_name = (index + 1).to_s + touch(page_name) + names_used << page_name + end + + where.not(name: names_used).update_all(enabled: false) + end + + def touch(name) + find_or_initialize_by(name: name).tap do |sitemap| + sitemap.update!( + last_posted_at: sitemap.last_posted_topic || 3.days.ago, + enabled: true + ) + end + end + end + + def topics + if name == RECENT_SITEMAP_NAME + sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at, :posts_count) + elsif name == NEWS_SITEMAP_NAME + sitemap_topics.pluck(:id, :title, :slug, :created_at) + else + sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at) + end + end + + def last_posted_topic + sitemap_topics.maximum(:updated_at) + end + + def max_page_size + SiteSetting.sitemap_page_size + end + + private + + def sitemap_topics + indexable_topics = Topic + .where(visible: true) + .joins(:category) + .where(categories: { read_restricted: false }) + + if name == RECENT_SITEMAP_NAME + indexable_topics.where('bumped_at > ?', 3.days.ago).order(bumped_at: :desc) + elsif name == NEWS_SITEMAP_NAME + indexable_topics.where('bumped_at > ?', 72.hours.ago).order(bumped_at: :desc) + else + offset = (name.to_i - 1) * max_page_size + + indexable_topics.limit(max_page_size).offset(offset) + end + end +end + +# == Schema Information +# +# Table name: sitemaps +# +# id :bigint not null, primary key +# name :string not null +# last_posted_at :datetime not null +# enabled :boolean default(TRUE), not null +# +# Indexes +# +# index_sitemaps_on_name (name) UNIQUE +# diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb index cb007ff69cd..25e6816cdce 100644 --- a/app/views/robots_txt/index.erb +++ b/app/views/robots_txt/index.erb @@ -12,4 +12,8 @@ Disallow: <%= path %> <% end %> +<%- if SiteSetting.enable_sitemap? && !SiteSetting.login_required? %> +Sitemap: <%= request.protocol %><%= request.host_with_port %>/sitemap.xml +<% end %> + <%= server_plugin_outlet "robots_txt_index" %> diff --git a/app/views/sitemap/index.xml.erb b/app/views/sitemap/index.xml.erb new file mode 100644 index 00000000000..17ad425883a --- /dev/null +++ b/app/views/sitemap/index.xml.erb @@ -0,0 +1,9 @@ + + + <% @sitemaps.each do |sitemap| %> + + <%= Discourse.base_url %>/sitemap_<%= sitemap.name %>.xml + <%= sitemap.last_posted_at.xmlschema %> + + <% end %> + diff --git a/app/views/sitemap/news.xml.erb b/app/views/sitemap/news.xml.erb new file mode 100644 index 00000000000..1d83a614f4c --- /dev/null +++ b/app/views/sitemap/news.xml.erb @@ -0,0 +1,25 @@ + + + <% @topics.each do |(id, title, slug, created_at)| %> + + <%= build_sitemap_topic_url(slug, id) %> + + + <%= SiteSetting.title %> + <%= @locale %> + + <%= created_at.xmlschema %> + <%= title %> + + + <% end %> + diff --git a/app/views/sitemap/page.xml.erb b/app/views/sitemap/page.xml.erb new file mode 100644 index 00000000000..6f69c5bee30 --- /dev/null +++ b/app/views/sitemap/page.xml.erb @@ -0,0 +1,15 @@ + + + <% @topics.each do |(id, slug, bumped_at, updated_at, posts_count)| %> + + <%= build_sitemap_topic_url(slug, id, posts_count) %> + <%= (bumped_at || updated_at).xmlschema %> + + <% end %> + diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index fe6edcf91b5..5d86544ac9b 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -2335,6 +2335,9 @@ en: base_font: "Base font to use for most text on the site. Themes can override via the `--font-family` CSS custom property." heading_font: "Font to use for headings on the site. Themes can override via the `--heading-font-family` CSS custom property." + enable_sitemap: "Generate a sitemap for your site and include it in the robots.txt file." + sitemap_page_size: "Number of URLs to include in each sitemap page. Max 50.000" + short_title: "The short title will be used on the user's home screen, launcher, or other places where space may be limited. It should be limited to 12 characters." dashboard_hidden_reports: "Allow to hide the specified reports from the dashboard." diff --git a/config/routes.rb b/config/routes.rb index acbb0c700cb..d88dea760f2 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -26,6 +26,13 @@ Discourse::Application.routes.draw do post "webhooks/sendgrid" => "webhooks#sendgrid" post "webhooks/sparkpost" => "webhooks#sparkpost" + scope path: nil, constraints: { format: :xml } do + resources :sitemap, only: [:index] + get "/sitemap_:page" => "sitemap#page", page: /[1-9][0-9]*/ + get "/sitemap_recent" => "sitemap#recent" + get "/news" => "sitemap#news" + end + scope path: nil, constraints: { format: /.*/ } do if Rails.env.development? mount Sidekiq::Web => "/sidekiq" diff --git a/config/site_settings.yml b/config/site_settings.yml index 5ccb510d6f0..bbfa5ed6408 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -360,6 +360,10 @@ basic: default: "arial" enum: "BaseFontSetting" refresh: true + enable_sitemap: + default: true + sitemap_page_size: + default: 10000 login: invite_only: diff --git a/db/migrate/20220331204447_create_sitemaps_table.rb b/db/migrate/20220331204447_create_sitemaps_table.rb new file mode 100644 index 00000000000..ae18d0eeaeb --- /dev/null +++ b/db/migrate/20220331204447_create_sitemaps_table.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class CreateSitemapsTable < ActiveRecord::Migration[6.1] + def change + create_table :sitemaps, if_not_exists: true do |t| + t.string :name, null: false + t.datetime :last_posted_at, null: false + t.boolean :enabled, null: false, default: true + end + + add_index :sitemaps, :name, unique: true, if_not_exists: true + end +end diff --git a/spec/models/sitemap_spec.rb b/spec/models/sitemap_spec.rb new file mode 100644 index 00000000000..d1d63c3b70a --- /dev/null +++ b/spec/models/sitemap_spec.rb @@ -0,0 +1,161 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe Sitemap do + describe '.regenerate_sitemaps' do + fab!(:topic) { Fabricate(:topic) } + + it 'always create the news and recent sitemaps' do + described_class.regenerate_sitemaps + + sitemaps = Sitemap.where(name: [Sitemap::NEWS_SITEMAP_NAME, Sitemap::RECENT_SITEMAP_NAME], enabled: true) + + expect(sitemaps.exists?).to eq(true) + end + + it 'bumps existing sitemaps last_posted_at attribute' do + news = Sitemap.create!(name: Sitemap::NEWS_SITEMAP_NAME, enabled: true, last_posted_at: 10.days.ago) + + described_class.regenerate_sitemaps + + expect(news.reload.last_posted_at).to eq_time(topic.updated_at) + end + + it 'creates the sitemap first page when there is a topic' do + described_class.regenerate_sitemaps + first_page = Sitemap.find_by(name: '1') + + expect(first_page.enabled).to eq(true) + end + + it "only counts topics from unrestricted categories" do + restricted_category = Fabricate(:category, read_restricted: true) + topic.update!(category: restricted_category) + Category.update_stats + + described_class.regenerate_sitemaps + first_page = Sitemap.find_by(name: '1') + + expect(first_page).to be_nil + end + + it 'disable empty pages' do + unused_page = Sitemap.touch('10') + + described_class.regenerate_sitemaps + + expect(unused_page.reload.enabled).to eq(false) + end + end + + describe '#topics' do + shared_examples 'Excludes hidden and restricted topics' do + it "doesn't include topics from restricted categories" do + restricted_category = Fabricate(:category, read_restricted: true) + Fabricate(:topic, category: restricted_category) + + topics_data = sitemap.topics + + expect(topics_data).to be_empty + end + + it "doesn't include hidden topics" do + Fabricate(:topic, visible: false) + + topics_data = sitemap.topics + + expect(topics_data).to be_empty + end + end + + context 'when the sitemap corresponds to a page' do + let(:sitemap) { described_class.new(enabled: true, last_posted_at: 1.minute.ago, name: '1') } + + it 'returns an empty array if there no topics' do + expect(sitemap.topics).to be_empty + end + + it 'returns all the neccessary topic attributes to render a sitemap URL' do + topic = Fabricate(:topic) + + topics_data = sitemap.topics.first + + expect(topics_data[0]).to eq(topic.id) + expect(topics_data[1]).to eq(topic.slug) + expect(topics_data[2]).to eq_time(topic.bumped_at) + expect(topics_data[3]).to eq_time(topic.updated_at) + end + + it 'returns empty when the page is empty because the previous page is not full' do + Fabricate(:topic) + sitemap.name = '2' + + topics_data = sitemap.topics + + expect(topics_data).to be_empty + end + + it "order topics by bumped_at asc" do + topic_1 = Fabricate(:topic, bumped_at: 3.minute.ago) + topic_2 = Fabricate(:topic, bumped_at: 1.minutes.ago) + topic_3 = Fabricate(:topic, bumped_at: 20.minutes.ago) + + topic_ids = sitemap.topics.map { |td| td[0] } + + expect(topic_ids).to contain_exactly(topic_2.id, topic_1.id, topic_3.id) + end + + it_behaves_like 'Excludes hidden and restricted topics' + end + + context 'sitemap for recent topics' do + let(:sitemap) do + described_class.new( + enabled: true, last_posted_at: 1.minute.ago, name: described_class::RECENT_SITEMAP_NAME + ) + end + + it 'return topics that were bumped less than three days ago' do + Fabricate(:topic, bumped_at: 4.days.ago) + recent_topic = Fabricate(:topic, bumped_at: 2.days.ago, posts_count: 3) + + topics_data = sitemap.topics + + expect(topics_data.length).to eq(1) + recent_topic_data = topics_data.first + expect(recent_topic_data[0]).to eq(recent_topic.id) + expect(recent_topic_data[1]).to eq(recent_topic.slug) + expect(recent_topic_data[2]).to eq_time(recent_topic.bumped_at) + expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at) + expect(recent_topic_data[4]).to eq(recent_topic.posts_count) + end + + it_behaves_like 'Excludes hidden and restricted topics' + end + + context 'news sitemap' do + let(:sitemap) do + described_class.new( + enabled: true, last_posted_at: 1.minute.ago, name: described_class::NEWS_SITEMAP_NAME + ) + end + + it 'returns topics that were bumped in the last 72 hours' do + Fabricate(:topic, bumped_at: 73.hours.ago) + recent_topic = Fabricate(:topic, bumped_at: 71.hours.ago) + + topics_data = sitemap.topics + + expect(topics_data.length).to eq(1) + recent_topic_data = topics_data.first + expect(recent_topic_data[0]).to eq(recent_topic.id) + expect(recent_topic_data[1]).to eq(recent_topic.title) + expect(recent_topic_data[2]).to eq(recent_topic.slug) + expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at) + end + + it_behaves_like 'Excludes hidden and restricted topics' + end + end +end diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb index e5dd0d88d0b..65fc58e05aa 100644 --- a/spec/requests/robots_txt_controller_spec.rb +++ b/spec/requests/robots_txt_controller_spec.rb @@ -130,5 +130,36 @@ RSpec.describe RobotsTxtController do expect(response.status).to eq(200) expect(response.body).to eq(SiteSetting.overridden_robots_txt) end + + describe 'sitemap' do + let(:sitemap_line) { "Sitemap: #{Discourse.base_protocol}://#{Discourse.current_hostname}/sitemap.xml" } + + it 'include sitemap location when enabled' do + SiteSetting.enable_sitemap = true + SiteSetting.login_required = false + + get '/robots.txt' + + expect(response.body).to include(sitemap_line) + end + + it "doesn't include sitemap location when disabled" do + SiteSetting.enable_sitemap = false + SiteSetting.login_required = false + + get '/robots.txt' + + expect(response.body).not_to include(sitemap_line) + end + + it "doesn't include sitemap location when site has login_required enabled" do + SiteSetting.enable_sitemap = true + SiteSetting.login_required = true + + get '/robots.txt' + + expect(response.body).not_to include(sitemap_line) + end + end end end diff --git a/spec/requests/sitemap_controller_spec.rb b/spec/requests/sitemap_controller_spec.rb new file mode 100644 index 00000000000..682d08549aa --- /dev/null +++ b/spec/requests/sitemap_controller_spec.rb @@ -0,0 +1,135 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe SitemapController do + describe 'before_action :check_sitemap_enabled' do + it 'returns a 404 if sitemap is disabled' do + Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) + SiteSetting.enable_sitemap = false + + get '/sitemap.xml' + + expect(response.status).to eq(404) + end + end + + describe '#index' do + it "lists no sitemaps if we haven't generated them yet" do + get '/sitemap.xml' + + sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc') + + expect(sitemaps).to be_empty + end + + it 'lists generated sitemaps' do + Sitemap.create!(name: 'recent', enabled: true, last_posted_at: 1.minute.ago) + + get '/sitemap.xml' + sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc') + + expect(sitemaps.length).to eq(1) + expect(sitemaps.first.text).to include('recent') + end + + it "doesn't list disabled sitemaps" do + Sitemap.create!(name: 'recent', enabled: false, last_posted_at: 1.minute.ago) + + get '/sitemap.xml' + sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc') + + expect(sitemaps).to be_empty + end + end + + describe '#page' do + before { Discourse.cache.delete("sitemap/#{1}/#{SiteSetting.sitemap_page_size}") } + + it "returns a 404 if the sitemap doesn't exist" do + get '/sitemap_999.xml' + + expect(response.status).to eq(404) + end + + it 'includes the topics for that page' do + topic = Fabricate(:topic) + Sitemap.create!(name: '1', enabled: true, last_posted_at: 1.minute.ago) + + get '/sitemap_1.xml' + url = Nokogiri::XML::Document.parse(response.body).css('url').last + loc = url.at_css('loc').text + last_mod = url.at_css('lastmod').text + + expect(response.status).to eq(200) + expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}") + expect(last_mod).to eq(topic.bumped_at.xmlschema) + end + end + + describe '#recent' do + let(:sitemap) { Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) } + + before { Discourse.cache.delete("sitemap/recent/#{sitemap.last_posted_at.to_i}") } + + it 'returns a sitemap with topics bumped in the last three days' do + topic = Fabricate(:topic, bumped_at: 1.minute.ago) + old_topic = Fabricate(:topic, bumped_at: 6.days.ago) + + get '/sitemap_recent.xml' + urls = Nokogiri::XML::Document.parse(response.body).css('url') + loc = urls.first.at_css('loc').text + last_mod = urls.first.at_css('lastmod').text + + expect(response.status).to eq(200) + expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}") + expect(last_mod).to eq(topic.bumped_at.xmlschema) + + all_urls = urls.map { |u| u.at_css('loc').text } + expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}") + end + + it 'generates correct page numbers based on the topic post count' do + topic = Fabricate(:topic, bumped_at: 1.minute.ago) + page_size = TopicView.chunk_size + + incomplete_page_size = TopicView.chunk_size - 1 + topic.update!(posts_count: incomplete_page_size, updated_at: 4.hour.ago) + get '/sitemap_recent.xml' + url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text + expect(url).not_to include('?page=2') + + topic.update!(posts_count: page_size, updated_at: 3.hour.ago) + get '/sitemap_recent.xml' + url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text + expect(url).not_to include('?page=2') + + two_page_size = page_size + 1 + topic.update!(posts_count: two_page_size, updated_at: 2.hour.ago) + get '/sitemap_recent.xml' + url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text + expect(url).to include('?page=2') + end + end + + describe '#news' do + let!(:sitemap) { Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME) } + + before { Discourse.cache.delete("sitemap/news") } + + it 'returns a sitemap with topics bumped in the last 72 hours' do + topic = Fabricate(:topic, bumped_at: 71.hours.ago) + old_topic = Fabricate(:topic, bumped_at: 73.hours.ago) + + get '/news.xml' + urls = Nokogiri::XML::Document.parse(response.body).css('url') + loc = urls.first.at_css('loc').text + + expect(response.status).to eq(200) + expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}") + + all_urls = urls.map { |u| u.at_css('loc').text } + expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}") + end + end +end