DEV: Add `DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT` env (#27199)

This commit adds a `DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT`
environment variable that, when set, allows us to dump the backtraces of
all threads of a Unicorn worker 2 seconds before it times out. In
development, the backtraces are dumped to `STDOUT`; in production they are
dumped to `unicorn.stdout.log`.

We want to dump all the backtraces to make it easier to identify the
cause of a Unicorn worker timing out.
This commit is contained in:
Alan Guo Xiang Tan 2024-05-27 12:20:38 +08:00 committed by GitHub
parent 3a91a92563
commit 6cafe59c76
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 43 additions and 0 deletions

View File

@ -268,4 +268,9 @@ end
# Unicorn hook that runs after a worker process has been forked.
after_fork do |server, worker|
  DiscourseEvent.trigger(:web_fork_started)
  Discourse.after_fork

  # On USR2, print a timestamped header followed by the backtrace of every
  # live thread in this process. Threads are separated by a blank line; a
  # thread whose backtrace is nil contributes an empty entry.
  Signal.trap("USR2") { puts <<~MSG }
    [#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] Received USR2 signal, dumping backtrace for all threads
    #{Thread.list.map { |t| "#{t.backtrace&.join("\n")}" }.join("\n\n")}
  MSG
end

View File

@ -0,0 +1,38 @@
# frozen_string_literal: true
if ENV["DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT"] && defined?(Unicorn::HttpServer)
# Override of Unicorn::HttpServer#murder_lazy_workers that sends USR2 to a
# worker shortly before it is killed for exceeding the timeout. The signal
# asks the worker to dump the backtraces of all its threads.
module UnicornHTTPServerPatch
  # Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
  #
  # Walks every known worker, asks workers that are about to exceed
  # `@timeout` to dump their backtraces, and kills workers that have already
  # exceeded it. Lines outside the MONKEY PATCH markers are kept verbatim
  # from upstream so the patch stays easy to diff against new Unicorn
  # releases.
  #
  # Returns the number of seconds until the next check is due (at least 1).
  def murder_lazy_workers
    next_sleep = @timeout - 1
    now = time_now.to_i
    @workers.dup.each_pair do |wpid, worker|
      tick = worker.tick
      0 == tick and next # skip workers that haven't processed any clients
      diff = now - tick
      tmp = @timeout - diff
      # START MONKEY PATCH
      # This method is invoked repeatedly while a worker stays stuck, so an
      # unguarded check would send USR2 (and dump every backtrace) on each
      # pass, including the pass that sends KILL. Remember the tick that was
      # already signalled: the same stuck request is dumped once, and a later
      # request (with a new tick) can be dumped again.
      if tmp < 2 && worker.instance_variable_get(:@backtrace_dump_tick) != tick
        worker.instance_variable_set(:@backtrace_dump_tick, tick)
        logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \
                       "(#{diff}s), sending USR2 to dump thread backtraces"
        kill_worker(:USR2, wpid)
      end
      # END MONKEY PATCH
      if tmp >= 0
        next_sleep > tmp and next_sleep = tmp
        next
      end
      next_sleep = 0
      logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
                     "(#{diff}s > #{@timeout}s), killing"
      kill_worker(:KILL, wpid) # take no prisoners for timeout violations
    end
    next_sleep <= 0 ? 1 : next_sleep
  end
end
Unicorn::HttpServer.prepend(UnicornHTTPServerPatch)
end