diff --git a/app/controllers/sitemap_controller.rb b/app/controllers/sitemap_controller.rb
new file mode 100644
index 00000000000..3ae7efcde04
--- /dev/null
+++ b/app/controllers/sitemap_controller.rb
@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+
+class SitemapController < ApplicationController
+ layout false
+ skip_before_action :preload_json, :check_xhr
+ before_action :check_sitemap_enabled
+
+ def index
+ @sitemaps = Sitemap
+ .where(enabled: true)
+ .where.not(name: Sitemap::NEWS_SITEMAP_NAME)
+
+ render :index
+ end
+
+ def page
+ index = params.require(:page)
+ sitemap = Sitemap.find_by(enabled: true, name: index.to_s)
+ raise Discourse::NotFound if sitemap.nil?
+
+ @output = Rails.cache.fetch("sitemap/#{sitemap.name}/#{sitemap.max_page_size}", expires_in: 24.hours) do
+ @topics = sitemap.topics
+ render :page, content_type: 'text/xml; charset=UTF-8'
+ end
+
+ render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed?
+ end
+
+ def recent
+ sitemap = Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME)
+
+ @output = Rails.cache.fetch("sitemap/recent/#{sitemap.last_posted_at.to_i}", expires_in: 1.hour) do
+ @topics = sitemap.topics
+ render :page, content_type: 'text/xml; charset=UTF-8'
+ end
+
+ render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed?
+ end
+
+ def news
+ sitemap = Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME)
+
+ @output = Rails.cache.fetch("sitemap/news", expires_in: 5.minutes) do
+ dlocale = SiteSetting.default_locale.downcase
+ @locale = dlocale.gsub(/_.*/, '')
+ @locale = dlocale.sub('_', '-') if @locale === "zh"
+ @topics = sitemap.topics
+ render :news, content_type: 'text/xml; charset=UTF-8'
+ end
+
+ render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed?
+ end
+
+ private
+
+ def check_sitemap_enabled
+ raise Discourse::NotFound if !SiteSetting.enable_sitemap
+ end
+
+ def build_sitemap_topic_url(slug, id, posts_count = nil)
+ base_url = [Discourse.base_url, 't', slug, id].join('/')
+ return base_url if posts_count.nil?
+
+ page, mod = posts_count.divmod(TopicView.chunk_size)
+ page += 1 if mod > 0
+
+ page > 1 ? "#{base_url}?page=#{page}" : base_url
+ end
+ helper_method :build_sitemap_topic_url
+
+end
diff --git a/app/jobs/scheduled/regenerate_sitemaps.rb b/app/jobs/scheduled/regenerate_sitemaps.rb
new file mode 100644
index 00000000000..36b9ac9b675
--- /dev/null
+++ b/app/jobs/scheduled/regenerate_sitemaps.rb
@@ -0,0 +1,11 @@
+# frozen_string_literal: true
+
+module Jobs
+ class RegenerateSitemaps < ::Jobs::Scheduled
+ every 1.hour
+
+ def execute(_args)
+ Sitemap.regenerate_sitemaps if SiteSetting.enable_sitemap?
+ end
+ end
+end
diff --git a/app/models/sitemap.rb b/app/models/sitemap.rb
new file mode 100644
index 00000000000..a6ead4692ea
--- /dev/null
+++ b/app/models/sitemap.rb
@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+
+class Sitemap < ActiveRecord::Base
+ RECENT_SITEMAP_NAME = 'recent'
+ NEWS_SITEMAP_NAME = 'news'
+
+ class << self
+ def regenerate_sitemaps
+ names_used = [RECENT_SITEMAP_NAME, NEWS_SITEMAP_NAME]
+
+ names_used.each { |name| touch(name) }
+
+ count = Category.where(read_restricted: false).sum(:topic_count)
+ max_page_size = SiteSetting.sitemap_page_size
+ size, mod = count.divmod(max_page_size)
+ size += 1 if mod > 0
+
+ size.times do |index|
+ page_name = (index + 1).to_s
+ touch(page_name)
+ names_used << page_name
+ end
+
+ where.not(name: names_used).update_all(enabled: false)
+ end
+
+ def touch(name)
+ find_or_initialize_by(name: name).tap do |sitemap|
+ sitemap.update!(
+ last_posted_at: sitemap.last_posted_topic || 3.days.ago,
+ enabled: true
+ )
+ end
+ end
+ end
+
+ def topics
+ if name == RECENT_SITEMAP_NAME
+ sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at, :posts_count)
+ elsif name == NEWS_SITEMAP_NAME
+ sitemap_topics.pluck(:id, :title, :slug, :created_at)
+ else
+ sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at)
+ end
+ end
+
+ def last_posted_topic
+ sitemap_topics.maximum(:updated_at)
+ end
+
+ def max_page_size
+ SiteSetting.sitemap_page_size
+ end
+
+ private
+
+ def sitemap_topics
+ indexable_topics = Topic
+ .where(visible: true)
+ .joins(:category)
+ .where(categories: { read_restricted: false })
+
+ if name == RECENT_SITEMAP_NAME
+ indexable_topics.where('bumped_at > ?', 3.days.ago).order(bumped_at: :desc)
+ elsif name == NEWS_SITEMAP_NAME
+ indexable_topics.where('bumped_at > ?', 72.hours.ago).order(bumped_at: :desc)
+ else
+ offset = (name.to_i - 1) * max_page_size
+
+ indexable_topics.limit(max_page_size).offset(offset)
+ end
+ end
+end
+
+# == Schema Information
+#
+# Table name: sitemaps
+#
+# id :bigint not null, primary key
+# name :string not null
+# last_posted_at :datetime not null
+# enabled :boolean default(TRUE), not null
+#
+# Indexes
+#
+# index_sitemaps_on_name (name) UNIQUE
+#
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index cb007ff69cd..25e6816cdce 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -12,4 +12,8 @@ Disallow: <%= path %>
<% end %>
+<%- if SiteSetting.enable_sitemap? && !SiteSetting.login_required? %>
+Sitemap: <%= request.protocol %><%= request.host_with_port %>/sitemap.xml
+<% end %>
+
<%= server_plugin_outlet "robots_txt_index" %>
diff --git a/app/views/sitemap/index.xml.erb b/app/views/sitemap/index.xml.erb
new file mode 100644
index 00000000000..17ad425883a
--- /dev/null
+++ b/app/views/sitemap/index.xml.erb
@@ -0,0 +1,9 @@
+
+
+ <% @sitemaps.each do |sitemap| %>
+
+ <%= Discourse.base_url %>/sitemap_<%= sitemap.name %>.xml
+ <%= sitemap.last_posted_at.xmlschema %>
+
+ <% end %>
+
diff --git a/app/views/sitemap/news.xml.erb b/app/views/sitemap/news.xml.erb
new file mode 100644
index 00000000000..1d83a614f4c
--- /dev/null
+++ b/app/views/sitemap/news.xml.erb
@@ -0,0 +1,25 @@
+
+
+ <% @topics.each do |(id, title, slug, created_at)| %>
+
+ <%= build_sitemap_topic_url(slug, id) %>
+
+
+ <%= SiteSetting.title %>
+ <%= @locale %>
+
+ <%= created_at.xmlschema %>
+ <%= title %>
+
+
+ <% end %>
+
diff --git a/app/views/sitemap/page.xml.erb b/app/views/sitemap/page.xml.erb
new file mode 100644
index 00000000000..6f69c5bee30
--- /dev/null
+++ b/app/views/sitemap/page.xml.erb
@@ -0,0 +1,15 @@
+
+
+ <% @topics.each do |(id, slug, bumped_at, updated_at, posts_count)| %>
+
+ <%= build_sitemap_topic_url(slug, id, posts_count) %>
+ <%= (bumped_at || updated_at).xmlschema %>
+
+ <% end %>
+
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index fe6edcf91b5..5d86544ac9b 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -2335,6 +2335,9 @@ en:
base_font: "Base font to use for most text on the site. Themes can override via the `--font-family` CSS custom property."
heading_font: "Font to use for headings on the site. Themes can override via the `--heading-font-family` CSS custom property."
+ enable_sitemap: "Generate a sitemap for your site and include it in the robots.txt file."
+ sitemap_page_size: "Number of URLs to include in each sitemap page. Max 50.000"
+
short_title: "The short title will be used on the user's home screen, launcher, or other places where space may be limited. It should be limited to 12 characters."
dashboard_hidden_reports: "Allow to hide the specified reports from the dashboard."
diff --git a/config/routes.rb b/config/routes.rb
index acbb0c700cb..d88dea760f2 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -26,6 +26,13 @@ Discourse::Application.routes.draw do
post "webhooks/sendgrid" => "webhooks#sendgrid"
post "webhooks/sparkpost" => "webhooks#sparkpost"
+ scope path: nil, constraints: { format: :xml } do
+ resources :sitemap, only: [:index]
+ get "/sitemap_:page" => "sitemap#page", page: /[1-9][0-9]*/
+ get "/sitemap_recent" => "sitemap#recent"
+ get "/news" => "sitemap#news"
+ end
+
scope path: nil, constraints: { format: /.*/ } do
if Rails.env.development?
mount Sidekiq::Web => "/sidekiq"
diff --git a/config/site_settings.yml b/config/site_settings.yml
index 5ccb510d6f0..bbfa5ed6408 100644
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@@ -360,6 +360,10 @@ basic:
default: "arial"
enum: "BaseFontSetting"
refresh: true
+ enable_sitemap:
+ default: true
+ sitemap_page_size:
+ default: 10000
login:
invite_only:
diff --git a/db/migrate/20220331204447_create_sitemaps_table.rb b/db/migrate/20220331204447_create_sitemaps_table.rb
new file mode 100644
index 00000000000..ae18d0eeaeb
--- /dev/null
+++ b/db/migrate/20220331204447_create_sitemaps_table.rb
@@ -0,0 +1,13 @@
+# frozen_string_literal: true
+
+class CreateSitemapsTable < ActiveRecord::Migration[6.1]
+ def change
+ create_table :sitemaps, if_not_exists: true do |t|
+ t.string :name, null: false
+ t.datetime :last_posted_at, null: false
+ t.boolean :enabled, null: false, default: true
+ end
+
+ add_index :sitemaps, :name, unique: true, if_not_exists: true
+ end
+end
diff --git a/spec/models/sitemap_spec.rb b/spec/models/sitemap_spec.rb
new file mode 100644
index 00000000000..d1d63c3b70a
--- /dev/null
+++ b/spec/models/sitemap_spec.rb
@@ -0,0 +1,161 @@
+# frozen_string_literal: true
+
+require 'rails_helper'
+
+describe Sitemap do
+ describe '.regenerate_sitemaps' do
+ fab!(:topic) { Fabricate(:topic) }
+
+ it 'always create the news and recent sitemaps' do
+ described_class.regenerate_sitemaps
+
+ sitemaps = Sitemap.where(name: [Sitemap::NEWS_SITEMAP_NAME, Sitemap::RECENT_SITEMAP_NAME], enabled: true)
+
+ expect(sitemaps.exists?).to eq(true)
+ end
+
+ it 'bumps existing sitemaps last_posted_at attribute' do
+ news = Sitemap.create!(name: Sitemap::NEWS_SITEMAP_NAME, enabled: true, last_posted_at: 10.days.ago)
+
+ described_class.regenerate_sitemaps
+
+ expect(news.reload.last_posted_at).to eq_time(topic.updated_at)
+ end
+
+ it 'creates the sitemap first page when there is a topic' do
+ described_class.regenerate_sitemaps
+ first_page = Sitemap.find_by(name: '1')
+
+ expect(first_page.enabled).to eq(true)
+ end
+
+ it "only counts topics from unrestricted categories" do
+ restricted_category = Fabricate(:category, read_restricted: true)
+ topic.update!(category: restricted_category)
+ Category.update_stats
+
+ described_class.regenerate_sitemaps
+ first_page = Sitemap.find_by(name: '1')
+
+ expect(first_page).to be_nil
+ end
+
+ it 'disable empty pages' do
+ unused_page = Sitemap.touch('10')
+
+ described_class.regenerate_sitemaps
+
+ expect(unused_page.reload.enabled).to eq(false)
+ end
+ end
+
+ describe '#topics' do
+ shared_examples 'Excludes hidden and restricted topics' do
+ it "doesn't include topics from restricted categories" do
+ restricted_category = Fabricate(:category, read_restricted: true)
+ Fabricate(:topic, category: restricted_category)
+
+ topics_data = sitemap.topics
+
+ expect(topics_data).to be_empty
+ end
+
+ it "doesn't include hidden topics" do
+ Fabricate(:topic, visible: false)
+
+ topics_data = sitemap.topics
+
+ expect(topics_data).to be_empty
+ end
+ end
+
+ context 'when the sitemap corresponds to a page' do
+ let(:sitemap) { described_class.new(enabled: true, last_posted_at: 1.minute.ago, name: '1') }
+
+ it 'returns an empty array if there no topics' do
+ expect(sitemap.topics).to be_empty
+ end
+
+ it 'returns all the neccessary topic attributes to render a sitemap URL' do
+ topic = Fabricate(:topic)
+
+ topics_data = sitemap.topics.first
+
+ expect(topics_data[0]).to eq(topic.id)
+ expect(topics_data[1]).to eq(topic.slug)
+ expect(topics_data[2]).to eq_time(topic.bumped_at)
+ expect(topics_data[3]).to eq_time(topic.updated_at)
+ end
+
+ it 'returns empty when the page is empty because the previous page is not full' do
+ Fabricate(:topic)
+ sitemap.name = '2'
+
+ topics_data = sitemap.topics
+
+ expect(topics_data).to be_empty
+ end
+
+ it "order topics by bumped_at asc" do
+ topic_1 = Fabricate(:topic, bumped_at: 3.minute.ago)
+ topic_2 = Fabricate(:topic, bumped_at: 1.minutes.ago)
+ topic_3 = Fabricate(:topic, bumped_at: 20.minutes.ago)
+
+ topic_ids = sitemap.topics.map { |td| td[0] }
+
+ expect(topic_ids).to contain_exactly(topic_2.id, topic_1.id, topic_3.id)
+ end
+
+ it_behaves_like 'Excludes hidden and restricted topics'
+ end
+
+ context 'sitemap for recent topics' do
+ let(:sitemap) do
+ described_class.new(
+ enabled: true, last_posted_at: 1.minute.ago, name: described_class::RECENT_SITEMAP_NAME
+ )
+ end
+
+ it 'return topics that were bumped less than three days ago' do
+ Fabricate(:topic, bumped_at: 4.days.ago)
+ recent_topic = Fabricate(:topic, bumped_at: 2.days.ago, posts_count: 3)
+
+ topics_data = sitemap.topics
+
+ expect(topics_data.length).to eq(1)
+ recent_topic_data = topics_data.first
+ expect(recent_topic_data[0]).to eq(recent_topic.id)
+ expect(recent_topic_data[1]).to eq(recent_topic.slug)
+ expect(recent_topic_data[2]).to eq_time(recent_topic.bumped_at)
+ expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at)
+ expect(recent_topic_data[4]).to eq(recent_topic.posts_count)
+ end
+
+ it_behaves_like 'Excludes hidden and restricted topics'
+ end
+
+ context 'news sitemap' do
+ let(:sitemap) do
+ described_class.new(
+ enabled: true, last_posted_at: 1.minute.ago, name: described_class::NEWS_SITEMAP_NAME
+ )
+ end
+
+ it 'returns topics that were bumped in the last 72 hours' do
+ Fabricate(:topic, bumped_at: 73.hours.ago)
+ recent_topic = Fabricate(:topic, bumped_at: 71.hours.ago)
+
+ topics_data = sitemap.topics
+
+ expect(topics_data.length).to eq(1)
+ recent_topic_data = topics_data.first
+ expect(recent_topic_data[0]).to eq(recent_topic.id)
+ expect(recent_topic_data[1]).to eq(recent_topic.title)
+ expect(recent_topic_data[2]).to eq(recent_topic.slug)
+ expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at)
+ end
+
+ it_behaves_like 'Excludes hidden and restricted topics'
+ end
+ end
+end
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index e5dd0d88d0b..65fc58e05aa 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -130,5 +130,36 @@ RSpec.describe RobotsTxtController do
expect(response.status).to eq(200)
expect(response.body).to eq(SiteSetting.overridden_robots_txt)
end
+
+ describe 'sitemap' do
+ let(:sitemap_line) { "Sitemap: #{Discourse.base_protocol}://#{Discourse.current_hostname}/sitemap.xml" }
+
+ it 'include sitemap location when enabled' do
+ SiteSetting.enable_sitemap = true
+ SiteSetting.login_required = false
+
+ get '/robots.txt'
+
+ expect(response.body).to include(sitemap_line)
+ end
+
+ it "doesn't include sitemap location when disabled" do
+ SiteSetting.enable_sitemap = false
+ SiteSetting.login_required = false
+
+ get '/robots.txt'
+
+ expect(response.body).not_to include(sitemap_line)
+ end
+
+ it "doesn't include sitemap location when site has login_required enabled" do
+ SiteSetting.enable_sitemap = true
+ SiteSetting.login_required = true
+
+ get '/robots.txt'
+
+ expect(response.body).not_to include(sitemap_line)
+ end
+ end
end
end
diff --git a/spec/requests/sitemap_controller_spec.rb b/spec/requests/sitemap_controller_spec.rb
new file mode 100644
index 00000000000..682d08549aa
--- /dev/null
+++ b/spec/requests/sitemap_controller_spec.rb
@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+
+require 'rails_helper'
+
+describe SitemapController do
+ describe 'before_action :check_sitemap_enabled' do
+ it 'returns a 404 if sitemap is disabled' do
+ Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME)
+ SiteSetting.enable_sitemap = false
+
+ get '/sitemap.xml'
+
+ expect(response.status).to eq(404)
+ end
+ end
+
+ describe '#index' do
+ it "lists no sitemaps if we haven't generated them yet" do
+ get '/sitemap.xml'
+
+ sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc')
+
+ expect(sitemaps).to be_empty
+ end
+
+ it 'lists generated sitemaps' do
+ Sitemap.create!(name: 'recent', enabled: true, last_posted_at: 1.minute.ago)
+
+ get '/sitemap.xml'
+ sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc')
+
+ expect(sitemaps.length).to eq(1)
+ expect(sitemaps.first.text).to include('recent')
+ end
+
+ it "doesn't list disabled sitemaps" do
+ Sitemap.create!(name: 'recent', enabled: false, last_posted_at: 1.minute.ago)
+
+ get '/sitemap.xml'
+ sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc')
+
+ expect(sitemaps).to be_empty
+ end
+ end
+
+ describe '#page' do
+ before { Discourse.cache.delete("sitemap/#{1}/#{SiteSetting.sitemap_page_size}") }
+
+ it "returns a 404 if the sitemap doesn't exist" do
+ get '/sitemap_999.xml'
+
+ expect(response.status).to eq(404)
+ end
+
+ it 'includes the topics for that page' do
+ topic = Fabricate(:topic)
+ Sitemap.create!(name: '1', enabled: true, last_posted_at: 1.minute.ago)
+
+ get '/sitemap_1.xml'
+ url = Nokogiri::XML::Document.parse(response.body).css('url').last
+ loc = url.at_css('loc').text
+ last_mod = url.at_css('lastmod').text
+
+ expect(response.status).to eq(200)
+ expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}")
+ expect(last_mod).to eq(topic.bumped_at.xmlschema)
+ end
+ end
+
+ describe '#recent' do
+ let(:sitemap) { Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) }
+
+ before { Discourse.cache.delete("sitemap/recent/#{sitemap.last_posted_at.to_i}") }
+
+ it 'returns a sitemap with topics bumped in the last three days' do
+ topic = Fabricate(:topic, bumped_at: 1.minute.ago)
+ old_topic = Fabricate(:topic, bumped_at: 6.days.ago)
+
+ get '/sitemap_recent.xml'
+ urls = Nokogiri::XML::Document.parse(response.body).css('url')
+ loc = urls.first.at_css('loc').text
+ last_mod = urls.first.at_css('lastmod').text
+
+ expect(response.status).to eq(200)
+ expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}")
+ expect(last_mod).to eq(topic.bumped_at.xmlschema)
+
+ all_urls = urls.map { |u| u.at_css('loc').text }
+ expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}")
+ end
+
+ it 'generates correct page numbers based on the topic post count' do
+ topic = Fabricate(:topic, bumped_at: 1.minute.ago)
+ page_size = TopicView.chunk_size
+
+ incomplete_page_size = TopicView.chunk_size - 1
+ topic.update!(posts_count: incomplete_page_size, updated_at: 4.hour.ago)
+ get '/sitemap_recent.xml'
+ url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text
+ expect(url).not_to include('?page=2')
+
+ topic.update!(posts_count: page_size, updated_at: 3.hour.ago)
+ get '/sitemap_recent.xml'
+ url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text
+ expect(url).not_to include('?page=2')
+
+ two_page_size = page_size + 1
+ topic.update!(posts_count: two_page_size, updated_at: 2.hour.ago)
+ get '/sitemap_recent.xml'
+ url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text
+ expect(url).to include('?page=2')
+ end
+ end
+
+ describe '#news' do
+ let!(:sitemap) { Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME) }
+
+ before { Discourse.cache.delete("sitemap/news") }
+
+ it 'returns a sitemap with topics bumped in the last 72 hours' do
+ topic = Fabricate(:topic, bumped_at: 71.hours.ago)
+ old_topic = Fabricate(:topic, bumped_at: 73.hours.ago)
+
+ get '/news.xml'
+ urls = Nokogiri::XML::Document.parse(response.body).css('url')
+ loc = urls.first.at_css('loc').text
+
+ expect(response.status).to eq(200)
+ expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}")
+
+ all_urls = urls.map { |u| u.at_css('loc').text }
+ expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}")
+ end
+ end
+end