diff --git a/app/controllers/sitemap_controller.rb b/app/controllers/sitemap_controller.rb new file mode 100644 index 00000000000..3ae7efcde04 --- /dev/null +++ b/app/controllers/sitemap_controller.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +class SitemapController < ApplicationController + layout false + skip_before_action :preload_json, :check_xhr + before_action :check_sitemap_enabled + + def index + @sitemaps = Sitemap + .where(enabled: true) + .where.not(name: Sitemap::NEWS_SITEMAP_NAME) + + render :index + end + + def page + index = params.require(:page) + sitemap = Sitemap.find_by(enabled: true, name: index.to_s) + raise Discourse::NotFound if sitemap.nil? + + @output = Rails.cache.fetch("sitemap/#{sitemap.name}/#{sitemap.max_page_size}", expires_in: 24.hours) do + @topics = sitemap.topics + render :page, content_type: 'text/xml; charset=UTF-8' + end + + render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed? + end + + def recent + sitemap = Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) + + @output = Rails.cache.fetch("sitemap/recent/#{sitemap.last_posted_at.to_i}", expires_in: 1.hour) do + @topics = sitemap.topics + render :page, content_type: 'text/xml; charset=UTF-8' + end + + render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed? + end + + def news + sitemap = Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME) + + @output = Rails.cache.fetch("sitemap/news", expires_in: 5.minutes) do + dlocale = SiteSetting.default_locale.downcase + @locale = dlocale.gsub(/_.*/, '') + @locale = dlocale.sub('_', '-') if @locale === "zh" + @topics = sitemap.topics + render :news, content_type: 'text/xml; charset=UTF-8' + end + + render plain: @output, content_type: 'text/xml; charset=UTF-8' unless performed? + end + + private + + def check_sitemap_enabled + raise Discourse::NotFound if !SiteSetting.enable_sitemap + end + + def build_sitemap_topic_url(slug, id, posts_count = nil) + base_url = [Discourse.base_url, 't', slug, id].join('/') + return base_url if posts_count.nil? + + page, mod = posts_count.divmod(TopicView.chunk_size) + page += 1 if mod > 0 + + page > 1 ? "#{base_url}?page=#{page}" : base_url + end + helper_method :build_sitemap_topic_url + +end diff --git a/app/jobs/scheduled/regenerate_sitemaps.rb b/app/jobs/scheduled/regenerate_sitemaps.rb new file mode 100644 index 00000000000..36b9ac9b675 --- /dev/null +++ b/app/jobs/scheduled/regenerate_sitemaps.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +module Jobs + class RegenerateSitemaps < ::Jobs::Scheduled + every 1.hour + + def execute(_args) + Sitemap.regenerate_sitemaps if SiteSetting.enable_sitemap? + end + end +end diff --git a/app/models/sitemap.rb b/app/models/sitemap.rb new file mode 100644 index 00000000000..a6ead4692ea --- /dev/null +++ b/app/models/sitemap.rb @@ -0,0 +1,87 @@ +# frozen_string_literal: true + +class Sitemap < ActiveRecord::Base + RECENT_SITEMAP_NAME = 'recent' + NEWS_SITEMAP_NAME = 'news' + + class << self + def regenerate_sitemaps + names_used = [RECENT_SITEMAP_NAME, NEWS_SITEMAP_NAME] + + names_used.each { |name| touch(name) } + + count = Category.where(read_restricted: false).sum(:topic_count) + max_page_size = SiteSetting.sitemap_page_size + size, mod = count.divmod(max_page_size) + size += 1 if mod > 0 + + size.times do |index| + page_name = (index + 1).to_s + touch(page_name) + names_used << page_name + end + + where.not(name: names_used).update_all(enabled: false) + end + + def touch(name) + find_or_initialize_by(name: name).tap do |sitemap| + sitemap.update!( + last_posted_at: sitemap.last_posted_topic || 3.days.ago, + enabled: true + ) + end + end + end + + def topics + if name == RECENT_SITEMAP_NAME + sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at, :posts_count) + elsif name == NEWS_SITEMAP_NAME + sitemap_topics.pluck(:id, :title, :slug, :created_at) + else + sitemap_topics.pluck(:id, :slug, :bumped_at, :updated_at) + end + end + + def last_posted_topic + sitemap_topics.maximum(:updated_at) + end + + def max_page_size + SiteSetting.sitemap_page_size + end + + private + + def sitemap_topics + indexable_topics = Topic + .where(visible: true) + .joins(:category) + .where(categories: { read_restricted: false }) + + if name == RECENT_SITEMAP_NAME + indexable_topics.where('bumped_at > ?', 3.days.ago).order(bumped_at: :desc) + elsif name == NEWS_SITEMAP_NAME + indexable_topics.where('bumped_at > ?', 72.hours.ago).order(bumped_at: :desc) + else + offset = (name.to_i - 1) * max_page_size + + indexable_topics.limit(max_page_size).offset(offset) + end + end +end + +# == Schema Information +# +# Table name: sitemaps +# +# id :bigint not null, primary key +# name :string not null +# last_posted_at :datetime not null +# enabled :boolean default(TRUE), not null +# +# Indexes +# +# index_sitemaps_on_name (name) UNIQUE +# diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb index cb007ff69cd..25e6816cdce 100644 --- a/app/views/robots_txt/index.erb +++ b/app/views/robots_txt/index.erb @@ -12,4 +12,8 @@ Disallow: <%= path %> <% end %> +<%- if SiteSetting.enable_sitemap? && !SiteSetting.login_required? %> +Sitemap: <%= request.protocol %><%= request.host_with_port %>/sitemap.xml +<% end %> + <%= server_plugin_outlet "robots_txt_index" %> diff --git a/app/views/sitemap/index.xml.erb b/app/views/sitemap/index.xml.erb new file mode 100644 index 00000000000..17ad425883a --- /dev/null +++ b/app/views/sitemap/index.xml.erb @@ -0,0 +1,9 @@ + + + <% @sitemaps.each do |sitemap| %> + + <%= Discourse.base_url %>/sitemap_<%= sitemap.name %>.xml + <%= sitemap.last_posted_at.xmlschema %> + + <% end %> + diff --git a/app/views/sitemap/news.xml.erb b/app/views/sitemap/news.xml.erb new file mode 100644 index 00000000000..1d83a614f4c --- /dev/null +++ b/app/views/sitemap/news.xml.erb @@ -0,0 +1,25 @@ + + + <% @topics.each do |(id, title, slug, created_at)| %> + + <%= build_sitemap_topic_url(slug, id) %> + + + <%= SiteSetting.title %> + <%= @locale %> + + <%= created_at.xmlschema %> + <%= title %> + + + <% end %> + diff --git a/app/views/sitemap/page.xml.erb b/app/views/sitemap/page.xml.erb new file mode 100644 index 00000000000..6f69c5bee30 --- /dev/null +++ b/app/views/sitemap/page.xml.erb @@ -0,0 +1,15 @@ + + + <% @topics.each do |(id, slug, bumped_at, updated_at, posts_count)| %> + + <%= build_sitemap_topic_url(slug, id, posts_count) %> + <%= (bumped_at || updated_at).xmlschema %> + + <% end %> + diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index fe6edcf91b5..5d86544ac9b 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -2335,6 +2335,9 @@ en: base_font: "Base font to use for most text on the site. Themes can override via the `--font-family` CSS custom property." heading_font: "Font to use for headings on the site. Themes can override via the `--heading-font-family` CSS custom property." + enable_sitemap: "Generate a sitemap for your site and include it in the robots.txt file." + sitemap_page_size: "Number of URLs to include in each sitemap page. Max 50.000" + short_title: "The short title will be used on the user's home screen, launcher, or other places where space may be limited. It should be limited to 12 characters." dashboard_hidden_reports: "Allow to hide the specified reports from the dashboard." diff --git a/config/routes.rb b/config/routes.rb index acbb0c700cb..d88dea760f2 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -26,6 +26,13 @@ Discourse::Application.routes.draw do post "webhooks/sendgrid" => "webhooks#sendgrid" post "webhooks/sparkpost" => "webhooks#sparkpost" + scope path: nil, constraints: { format: :xml } do + resources :sitemap, only: [:index] + get "/sitemap_:page" => "sitemap#page", page: /[1-9][0-9]*/ + get "/sitemap_recent" => "sitemap#recent" + get "/news" => "sitemap#news" + end + scope path: nil, constraints: { format: /.*/ } do if Rails.env.development? mount Sidekiq::Web => "/sidekiq" diff --git a/config/site_settings.yml b/config/site_settings.yml index 5ccb510d6f0..bbfa5ed6408 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -360,6 +360,10 @@ basic: default: "arial" enum: "BaseFontSetting" refresh: true + enable_sitemap: + default: true + sitemap_page_size: + default: 10000 login: invite_only: diff --git a/db/migrate/20220331204447_create_sitemaps_table.rb b/db/migrate/20220331204447_create_sitemaps_table.rb new file mode 100644 index 00000000000..ae18d0eeaeb --- /dev/null +++ b/db/migrate/20220331204447_create_sitemaps_table.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class CreateSitemapsTable < ActiveRecord::Migration[6.1] + def change + create_table :sitemaps, if_not_exists: true do |t| + t.string :name, null: false + t.datetime :last_posted_at, null: false + t.boolean :enabled, null: false, default: true + end + + add_index :sitemaps, :name, unique: true, if_not_exists: true + end +end diff --git a/spec/models/sitemap_spec.rb b/spec/models/sitemap_spec.rb new file mode 100644 index 00000000000..d1d63c3b70a --- /dev/null +++ b/spec/models/sitemap_spec.rb @@ -0,0 +1,161 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe Sitemap do + describe '.regenerate_sitemaps' do + fab!(:topic) { Fabricate(:topic) } + + it 'always create the news and recent sitemaps' do + described_class.regenerate_sitemaps + + sitemaps = Sitemap.where(name: [Sitemap::NEWS_SITEMAP_NAME, Sitemap::RECENT_SITEMAP_NAME], enabled: true) + + expect(sitemaps.exists?).to eq(true) + end + + it 'bumps existing sitemaps last_posted_at attribute' do + news = Sitemap.create!(name: Sitemap::NEWS_SITEMAP_NAME, enabled: true, last_posted_at: 10.days.ago) + + described_class.regenerate_sitemaps + + expect(news.reload.last_posted_at).to eq_time(topic.updated_at) + end + + it 'creates the sitemap first page when there is a topic' do + described_class.regenerate_sitemaps + first_page = Sitemap.find_by(name: '1') + + expect(first_page.enabled).to eq(true) + end + + it "only counts topics from unrestricted categories" do + restricted_category = Fabricate(:category, read_restricted: true) + topic.update!(category: restricted_category) + Category.update_stats + + described_class.regenerate_sitemaps + first_page = Sitemap.find_by(name: '1') + + expect(first_page).to be_nil + end + + it 'disable empty pages' do + unused_page = Sitemap.touch('10') + + described_class.regenerate_sitemaps + + expect(unused_page.reload.enabled).to eq(false) + end + end + + describe '#topics' do + shared_examples 'Excludes hidden and restricted topics' do + it "doesn't include topics from restricted categories" do + restricted_category = Fabricate(:category, read_restricted: true) + Fabricate(:topic, category: restricted_category) + + topics_data = sitemap.topics + + expect(topics_data).to be_empty + end + + it "doesn't include hidden topics" do + Fabricate(:topic, visible: false) + + topics_data = sitemap.topics + + expect(topics_data).to be_empty + end + end + + context 'when the sitemap corresponds to a page' do + let(:sitemap) { described_class.new(enabled: true, last_posted_at: 1.minute.ago, name: '1') } + + it 'returns an empty array if there no topics' do + expect(sitemap.topics).to be_empty + end + + it 'returns all the neccessary topic attributes to render a sitemap URL' do + topic = Fabricate(:topic) + + topics_data = sitemap.topics.first + + expect(topics_data[0]).to eq(topic.id) + expect(topics_data[1]).to eq(topic.slug) + expect(topics_data[2]).to eq_time(topic.bumped_at) + expect(topics_data[3]).to eq_time(topic.updated_at) + end + + it 'returns empty when the page is empty because the previous page is not full' do + Fabricate(:topic) + sitemap.name = '2' + + topics_data = sitemap.topics + + expect(topics_data).to be_empty + end + + it "order topics by bumped_at asc" do + topic_1 = Fabricate(:topic, bumped_at: 3.minute.ago) + topic_2 = Fabricate(:topic, bumped_at: 1.minutes.ago) + topic_3 = Fabricate(:topic, bumped_at: 20.minutes.ago) + + topic_ids = sitemap.topics.map { |td| td[0] } + + expect(topic_ids).to contain_exactly(topic_2.id, topic_1.id, topic_3.id) + end + + it_behaves_like 'Excludes hidden and restricted topics' + end + + context 'sitemap for recent topics' do + let(:sitemap) do + described_class.new( + enabled: true, last_posted_at: 1.minute.ago, name: described_class::RECENT_SITEMAP_NAME + ) + end + + it 'return topics that were bumped less than three days ago' do + Fabricate(:topic, bumped_at: 4.days.ago) + recent_topic = Fabricate(:topic, bumped_at: 2.days.ago, posts_count: 3) + + topics_data = sitemap.topics + + expect(topics_data.length).to eq(1) + recent_topic_data = topics_data.first + expect(recent_topic_data[0]).to eq(recent_topic.id) + expect(recent_topic_data[1]).to eq(recent_topic.slug) + expect(recent_topic_data[2]).to eq_time(recent_topic.bumped_at) + expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at) + expect(recent_topic_data[4]).to eq(recent_topic.posts_count) + end + + it_behaves_like 'Excludes hidden and restricted topics' + end + + context 'news sitemap' do + let(:sitemap) do + described_class.new( + enabled: true, last_posted_at: 1.minute.ago, name: described_class::NEWS_SITEMAP_NAME + ) + end + + it 'returns topics that were bumped in the last 72 hours' do + Fabricate(:topic, bumped_at: 73.hours.ago) + recent_topic = Fabricate(:topic, bumped_at: 71.hours.ago) + + topics_data = sitemap.topics + + expect(topics_data.length).to eq(1) + recent_topic_data = topics_data.first + expect(recent_topic_data[0]).to eq(recent_topic.id) + expect(recent_topic_data[1]).to eq(recent_topic.title) + expect(recent_topic_data[2]).to eq(recent_topic.slug) + expect(recent_topic_data[3]).to eq_time(recent_topic.updated_at) + end + + it_behaves_like 'Excludes hidden and restricted topics' + end + end +end diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb index e5dd0d88d0b..65fc58e05aa 100644 --- a/spec/requests/robots_txt_controller_spec.rb +++ b/spec/requests/robots_txt_controller_spec.rb @@ -130,5 +130,36 @@ RSpec.describe RobotsTxtController do expect(response.status).to eq(200) expect(response.body).to eq(SiteSetting.overridden_robots_txt) end + + describe 'sitemap' do + let(:sitemap_line) { "Sitemap: #{Discourse.base_protocol}://#{Discourse.current_hostname}/sitemap.xml" } + + it 'include sitemap location when enabled' do + SiteSetting.enable_sitemap = true + SiteSetting.login_required = false + + get '/robots.txt' + + expect(response.body).to include(sitemap_line) + end + + it "doesn't include sitemap location when disabled" do + SiteSetting.enable_sitemap = false + SiteSetting.login_required = false + + get '/robots.txt' + + expect(response.body).not_to include(sitemap_line) + end + + it "doesn't include sitemap location when site has login_required enabled" do + SiteSetting.enable_sitemap = true + SiteSetting.login_required = true + + get '/robots.txt' + + expect(response.body).not_to include(sitemap_line) + end + end end end diff --git a/spec/requests/sitemap_controller_spec.rb b/spec/requests/sitemap_controller_spec.rb new file mode 100644 index 00000000000..682d08549aa --- /dev/null +++ b/spec/requests/sitemap_controller_spec.rb @@ -0,0 +1,135 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe SitemapController do + describe 'before_action :check_sitemap_enabled' do + it 'returns a 404 if sitemap is disabled' do + Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) + SiteSetting.enable_sitemap = false + + get '/sitemap.xml' + + expect(response.status).to eq(404) + end + end + + describe '#index' do + it "lists no sitemaps if we haven't generated them yet" do + get '/sitemap.xml' + + sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc') + + expect(sitemaps).to be_empty + end + + it 'lists generated sitemaps' do + Sitemap.create!(name: 'recent', enabled: true, last_posted_at: 1.minute.ago) + + get '/sitemap.xml' + sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc') + + expect(sitemaps.length).to eq(1) + expect(sitemaps.first.text).to include('recent') + end + + it "doesn't list disabled sitemaps" do + Sitemap.create!(name: 'recent', enabled: false, last_posted_at: 1.minute.ago) + + get '/sitemap.xml' + sitemaps = Nokogiri::XML::Document.parse(response.body).css('loc') + + expect(sitemaps).to be_empty + end + end + + describe '#page' do + before { Discourse.cache.delete("sitemap/#{1}/#{SiteSetting.sitemap_page_size}") } + + it "returns a 404 if the sitemap doesn't exist" do + get '/sitemap_999.xml' + + expect(response.status).to eq(404) + end + + it 'includes the topics for that page' do + topic = Fabricate(:topic) + Sitemap.create!(name: '1', enabled: true, last_posted_at: 1.minute.ago) + + get '/sitemap_1.xml' + url = Nokogiri::XML::Document.parse(response.body).css('url').last + loc = url.at_css('loc').text + last_mod = url.at_css('lastmod').text + + expect(response.status).to eq(200) + expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}") + expect(last_mod).to eq(topic.bumped_at.xmlschema) + end + end + + describe '#recent' do + let(:sitemap) { Sitemap.touch(Sitemap::RECENT_SITEMAP_NAME) } + + before { Discourse.cache.delete("sitemap/recent/#{sitemap.last_posted_at.to_i}") } + + it 'returns a sitemap with topics bumped in the last three days' do + topic = Fabricate(:topic, bumped_at: 1.minute.ago) + old_topic = Fabricate(:topic, bumped_at: 6.days.ago) + + get '/sitemap_recent.xml' + urls = Nokogiri::XML::Document.parse(response.body).css('url') + loc = urls.first.at_css('loc').text + last_mod = urls.first.at_css('lastmod').text + + expect(response.status).to eq(200) + expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}") + expect(last_mod).to eq(topic.bumped_at.xmlschema) + + all_urls = urls.map { |u| u.at_css('loc').text } + expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}") + end + + it 'generates correct page numbers based on the topic post count' do + topic = Fabricate(:topic, bumped_at: 1.minute.ago) + page_size = TopicView.chunk_size + + incomplete_page_size = TopicView.chunk_size - 1 + topic.update!(posts_count: incomplete_page_size, updated_at: 4.hour.ago) + get '/sitemap_recent.xml' + url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text + expect(url).not_to include('?page=2') + + topic.update!(posts_count: page_size, updated_at: 3.hour.ago) + get '/sitemap_recent.xml' + url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text + expect(url).not_to include('?page=2') + + two_page_size = page_size + 1 + topic.update!(posts_count: two_page_size, updated_at: 2.hour.ago) + get '/sitemap_recent.xml' + url = Nokogiri::XML::Document.parse(response.body).at_css('loc').text + expect(url).to include('?page=2') + end + end + + describe '#news' do + let!(:sitemap) { Sitemap.touch(Sitemap::NEWS_SITEMAP_NAME) } + + before { Discourse.cache.delete("sitemap/news") } + + it 'returns a sitemap with topics bumped in the last 72 hours' do + topic = Fabricate(:topic, bumped_at: 71.hours.ago) + old_topic = Fabricate(:topic, bumped_at: 73.hours.ago) + + get '/news.xml' + urls = Nokogiri::XML::Document.parse(response.body).css('url') + loc = urls.first.at_css('loc').text + + expect(response.status).to eq(200) + expect(loc).to eq("#{Discourse.base_url}/t/#{topic.slug}/#{topic.id}") + + all_urls = urls.map { |u| u.at_css('loc').text } + expect(all_urls).not_to include("#{Discourse.base_url}/t/#{old_topic.slug}/#{old_topic.id}") + end + end +end