From 4967541275dad9493a31b23213557d4787bba774 Mon Sep 17 00:00:00 2001 From: Sam Date: Fri, 5 Aug 2022 17:40:22 +1000 Subject: [PATCH] FIX: properly log all internal job failures (#17805) Our internal implementation of #perform on jobs performs remapping. This happens cause we do "exception aggregation". Scheduled jobs run on every site in the multisite cluster, and we report one error per site that failed. During this aggregation we reshape the context from the original object shape returned by mini_scheduler The new integration test ensures this interface will remain stable even if decoupled parts of the code change shapes. Co-authored-by: Alan Guo Xiang Tan --- lib/discourse.rb | 6 ++++++ spec/jobs/jobs_base_spec.rb | 28 +++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/lib/discourse.rb b/lib/discourse.rb index 5f7334d12e4..43659294fdc 100644 --- a/lib/discourse.rb +++ b/lib/discourse.rb @@ -177,6 +177,7 @@ module Discourse job = context[:job] + # mini_scheduler direct reporting if Hash === job job_class = job["class"] if job_class @@ -184,6 +185,11 @@ module Discourse end end + # internal reporting + if job.class == Class && ::Jobs::Base > job + job_exception_stats[job] += 1 + end + cm = RailsMultisite::ConnectionManagement parent_logger.handle_exception(ex, { current_db: cm.current_db, diff --git a/spec/jobs/jobs_base_spec.rb b/spec/jobs/jobs_base_spec.rb index 5225019b03d..619ab0f8cd9 100644 --- a/spec/jobs/jobs_base_spec.rb +++ b/spec/jobs/jobs_base_spec.rb @@ -10,12 +10,15 @@ RSpec.describe ::Jobs::Base do end class BadJob < ::Jobs::Base + class BadJobError < StandardError + end + attr_accessor :fail_count def execute(args) @fail_count ||= 0 @fail_count += 1 - raise StandardError + raise BadJobError end end @@ -35,6 +38,29 @@ RSpec.describe ::Jobs::Base do expect(bad.fail_count).to eq(3) end + describe '#perform' do + context 'when a job raises an error' do + before do + Discourse.reset_job_exception_stats! + end + + after do + Discourse.reset_job_exception_stats! + end + + it 'collects stats for failing jobs in Discourse.job_exception_stats' do + bad = BadJob.new + 3.times do + # During test env handle_job_exception errors out + # in production this is suppressed + expect { bad.perform({}) }.to raise_error(BadJob::BadJobError) + end + + expect(Discourse.job_exception_stats).to eq({ BadJob => 3 }) + end + end + end + it 'delegates the process call to execute' do ::Jobs::Base.any_instance.expects(:execute).with('hello' => 'world') ::Jobs::Base.new.perform('hello' => 'world', 'sync_exec' => true)