FIX: properly log all internal job failures (#17805)

Our internal implementation of #perform on jobs performs remapping.

This happens cause we do "exception aggregation".

Scheduled jobs run on every site in the multisite cluster, and we report
one error per site that failed. During this aggregation we reshape the
context from the original object shape returned by mini_scheduler

The new integration test ensures this interface will remain stable even if
decoupled parts of the code change shapes.

Co-authored-by: Alan Guo Xiang Tan <gxtan1990@gmail.com>
This commit is contained in:
Sam 2022-08-05 17:40:22 +10:00 committed by GitHub
parent 0df1c4eab2
commit 4967541275
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 1 deletions

View File

@ -177,6 +177,7 @@ module Discourse
job = context[:job] job = context[:job]
# mini_scheduler direct reporting
if Hash === job if Hash === job
job_class = job["class"] job_class = job["class"]
if job_class if job_class
@ -184,6 +185,11 @@ module Discourse
end end
end end
# internal reporting
if job.class == Class && ::Jobs::Base > job
job_exception_stats[job] += 1
end
cm = RailsMultisite::ConnectionManagement cm = RailsMultisite::ConnectionManagement
parent_logger.handle_exception(ex, { parent_logger.handle_exception(ex, {
current_db: cm.current_db, current_db: cm.current_db,

View File

@ -10,12 +10,15 @@ RSpec.describe ::Jobs::Base do
end end
class BadJob < ::Jobs::Base class BadJob < ::Jobs::Base
class BadJobError < StandardError
end
attr_accessor :fail_count attr_accessor :fail_count
def execute(args) def execute(args)
@fail_count ||= 0 @fail_count ||= 0
@fail_count += 1 @fail_count += 1
raise StandardError raise BadJobError
end end
end end
@ -35,6 +38,29 @@ RSpec.describe ::Jobs::Base do
expect(bad.fail_count).to eq(3) expect(bad.fail_count).to eq(3)
end end
describe '#perform' do
context 'when a job raises an error' do
before do
Discourse.reset_job_exception_stats!
end
after do
Discourse.reset_job_exception_stats!
end
it 'collects stats for failing jobs in Discourse.job_exception_stats' do
bad = BadJob.new
3.times do
# During test env handle_job_exception errors out
# in production this is suppressed
expect { bad.perform({}) }.to raise_error(BadJob::BadJobError)
end
expect(Discourse.job_exception_stats).to eq({ BadJob => 3 })
end
end
end
it 'delegates the process call to execute' do it 'delegates the process call to execute' do
::Jobs::Base.any_instance.expects(:execute).with('hello' => 'world') ::Jobs::Base.any_instance.expects(:execute).with('hello' => 'world')
::Jobs::Base.new.perform('hello' => 'world', 'sync_exec' => true) ::Jobs::Base.new.perform('hello' => 'world', 'sync_exec' => true)