From 23c38cbf1189d90384c28654d11ab4a65191ed03 Mon Sep 17 00:00:00 2001 From: Alan Guo Xiang Tan Date: Mon, 3 Jun 2024 12:51:12 +0800 Subject: [PATCH] DEV: Log Unicorn worker timeout backtraces to `Rails.logger` (#27257) This commit introduces the following changes: 1. Introduce the `SignalTrapLogger` singleton which starts a single thread that polls a queue to log messages with the specified logger. This thread is necessary becasue most loggers cannot be used inside the `Signal.trap` context as they rely on mutexes which are not allowed within the context. 2. Moves the monkey patch in `freedom_patches/unicorn_http_server_patch.rb` to `config/unicorn.config.rb` which is already monkey patching `Unicorn::HttpServer`. 3. `Unicorn::HttpServer` will now automatically send a `USR2` signal to a unicorn worker 2 seconds before the worker is timed out by the Unicorn master. 4. When a Unicorn worker receives a `USR2` signal, it will now log only the main thread's backtraces to `Rails.logger`. Previously, it was `put`ing the backtraces to `STDOUT` which most people wouldn't read. Logging it via `Rails.logger` will make the backtraces easily accessible via `/logs`. --- config/unicorn.conf.rb | 58 ++++++++++++++----- lib/demon/base.rb | 21 ++++--- lib/demon/email_sync.rb | 12 ++-- lib/demon/sidekiq.rb | 45 ++------------ .../unicorn_http_server_patch.rb | 38 ------------ lib/signal_trap_logger.rb | 49 ++++++++++++++++ 6 files changed, 118 insertions(+), 105 deletions(-) delete mode 100644 lib/freedom_patches/unicorn_http_server_patch.rb create mode 100644 lib/signal_trap_logger.rb diff --git a/config/unicorn.conf.rb b/config/unicorn.conf.rb index c3708e9edf2..5d377cfc31e 100644 --- a/config/unicorn.conf.rb +++ b/config/unicorn.conf.rb @@ -84,14 +84,8 @@ before_fork do |server, worker| server.logger.info "starting #{sidekiqs} supervised sidekiqs" require "demon/sidekiq" - Demon::Sidekiq.logger = server.logger Demon::Sidekiq.after_fork { DiscourseEvent.trigger(:sidekiq_fork_started) } - Demon::Sidekiq.start(sidekiqs) - - Signal.trap("SIGTSTP") do - Demon::Sidekiq.log("Issuing stop to Sidekiq") - Demon::Sidekiq.stop - end + Demon::Sidekiq.start(sidekiqs, logger: server.logger) # Trap USR1, so we can re-issue to sidekiq workers # but chain the default unicorn implementation as well @@ -104,21 +98,48 @@ before_fork do |server, worker| if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true" server.logger.info "starting up EmailSync demon" - Demon::EmailSync.start - Signal.trap("SIGTSTP") do - STDERR.puts "#{Time.now}: Issuing stop to EmailSync" - Demon::EmailSync.stop - end + Demon::EmailSync.start(1, logger: server.logger) end DiscoursePluginRegistry.demon_processes.each do |demon_class| server.logger.info "starting #{demon_class.prefix} demon" - demon_class.start + demon_class.start(1, logger: server.logger) end class ::Unicorn::HttpServer alias master_sleep_orig master_sleep + # Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496 + def murder_lazy_workers + next_sleep = @timeout - 1 + now = time_now.to_i + @workers.dup.each_pair do |wpid, worker| + tick = worker.tick + 0 == tick and next # skip workers that haven't processed any clients + diff = now - tick + tmp = @timeout - diff + + # START MONKEY PATCH + if tmp < 2 + logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \ + "(#{diff}s), sending USR2 to dump thread backtraces" + kill_worker(:USR2, wpid) + end + # END MONKEY PATCH + + if tmp >= 0 + next_sleep > tmp and next_sleep = tmp + next + end + next_sleep = 0 + logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \ + "(#{diff}s > #{@timeout}s), killing" + + kill_worker(:KILL, wpid) # take no prisoners for timeout violations + end + next_sleep <= 0 ? 1 : next_sleep + end + def max_sidekiq_rss rss = `ps -eo rss,args | grep sidekiq | grep -v grep | awk '{print $1}'`.split("\n") @@ -270,9 +291,14 @@ end after_fork do |server, worker| DiscourseEvent.trigger(:web_fork_started) Discourse.after_fork + SignalTrapLogger.instance.after_fork - Signal.trap("USR2") { puts <<~MSG } - [#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] Received USR2 signal, dumping backtrace for all threads - #{Thread.list.map { |t| "#{t.backtrace&.join("\n")}" }.join("\n\n")} + Signal.trap("USR2") do + message = <<~MSG + Unicorn worker received USR2 signal indicating it is about to timeout, dumping backtrace for main thread + #{Thread.current.backtrace&.join("\n")} MSG + + SignalTrapLogger.instance.log(Rails.logger, message, level: :warn) + end end diff --git a/lib/demon/base.rb b/lib/demon/base.rb index c49265a4cfb..5ab23c2a3ad 100644 --- a/lib/demon/base.rb +++ b/lib/demon/base.rb @@ -9,9 +9,9 @@ class Demon::Base @demons end - def self.start(count = 1, verbose: false) + def self.start(count = 1, verbose: false, logger: nil) @demons ||= {} - count.times { |i| (@demons["#{prefix}_#{i}"] ||= new(i, verbose: verbose)).start } + count.times { |i| (@demons["#{prefix}_#{i}"] ||= new(i, verbose:, logger:)).start } end def self.stop @@ -39,7 +39,7 @@ class Demon::Base attr_reader :pid, :parent_pid, :started, :index attr_accessor :stop_timeout - def initialize(index, rails_root: nil, parent_pid: nil, verbose: false) + def initialize(index, rails_root: nil, parent_pid: nil, verbose: false, logger: nil) @index = index @pid = nil @parent_pid = parent_pid || Process.pid @@ -47,6 +47,11 @@ class Demon::Base @stop_timeout = 10 @rails_root = rails_root || Rails.root @verbose = verbose + @logger = logger || Logger.new(STDERR) + end + + def log(message, level: :info) + @logger.public_send(level, message) end def pid_file @@ -72,6 +77,7 @@ class Demon::Base def stop @started = false + if @pid Process.kill(stop_signal, @pid) @@ -99,7 +105,7 @@ class Demon::Base wait_for_stop.call if alive? - STDERR.puts "Process would not terminate cleanly, force quitting. pid: #{@pid} #{self.class}" + log("Process would not terminate cleanly, force quitting. pid: #{@pid} #{self.class}") Process.kill("KILL", @pid) end @@ -125,8 +131,9 @@ class Demon::Base rescue StandardError -1 end + if dead - STDERR.puts "Detected dead worker #{@pid}, restarting..." + log("Detected dead worker #{@pid}, restarting...") @pid = nil @started = false start @@ -138,7 +145,7 @@ class Demon::Base if existing = already_running? # should not happen ... so kill violently - STDERR.puts "Attempting to kill pid #{existing}" + log("Attempting to kill pid #{existing}") Process.kill("TERM", existing) end @@ -199,7 +206,7 @@ class Demon::Base Process.kill "KILL", Process.pid end rescue => e - STDERR.puts "URGENT monitoring thread had an exception #{e}" + log("URGENT monitoring thread had an exception #{e}") end sleep 1 end diff --git a/lib/demon/email_sync.rb b/lib/demon/email_sync.rb index 8658ea7f673..736f3adfc48 100644 --- a/lib/demon/email_sync.rb +++ b/lib/demon/email_sync.rb @@ -77,14 +77,14 @@ class Demon::EmailSync < ::Demon::Base end def after_fork - puts "[EmailSync] Loading EmailSync in process id #{Process.pid}" + log("[EmailSync] Loading EmailSync in process id #{Process.pid}") loop do break if Discourse.redis.set(HEARTBEAT_KEY, Time.now.to_i, ex: HEARTBEAT_INTERVAL, nx: true) sleep HEARTBEAT_INTERVAL end - puts "[EmailSync] Starting EmailSync main thread" + log("[EmailSync] Starting EmailSync main thread") @running = true @sync_data = {} @@ -158,18 +158,20 @@ class Demon::EmailSync < ::Demon::Base Discourse.redis.del(HEARTBEAT_KEY) exit 0 rescue => e - STDERR.puts e.message - STDERR.puts e.backtrace.join("\n") + log("#{e.message}: #{e.backtrace.join("\n")}") exit 1 end def kill_and_disconnect!(data) data[:thread].kill data[:thread].join + begin data[:syncer]&.disconnect! rescue Net::IMAP::ResponseError => err - puts "[EmailSync] Encountered a response error when disconnecting: #{err}" + log( + "[EmailSync] Encountered a response error when disconnecting: #{err}\n#{err.backtrace.join("\n")}", + ) end end end diff --git a/lib/demon/sidekiq.rb b/lib/demon/sidekiq.rb index def77f03bea..17bb18b7998 100644 --- a/lib/demon/sidekiq.rb +++ b/lib/demon/sidekiq.rb @@ -3,8 +3,6 @@ require "demon/base" class Demon::Sidekiq < ::Demon::Base - cattr_accessor :logger - def self.prefix "sidekiq" end @@ -13,37 +11,6 @@ class Demon::Sidekiq < ::Demon::Base blk ? (@blk = blk) : @blk end - def self.format(message) - "[#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] #{message}" - end - - def self.log(message, level: :info) - # We use an IO pipe and log messages using the logger in a seperate thread to avoid the `log writing failed. can't be called from trap context` - # error message that is raised when trying to log from within a `Signal.trap` block. - if logger - if !defined?(@logger_read_pipe) - @logger_read_pipe, @logger_write_pipe = IO.pipe - - @logger_thread = - Thread.new do - begin - while readable_io = IO.select([@logger_read_pipe]) - logger.public_send(level, readable_io.first[0].gets.strip) - end - rescue => e - STDOUT.puts self.format( - "Error in Sidekiq demon logger thread: #{e.message}\n#{e.backtrace.join("\n")}", - ) - end - end - end - - @logger_write_pipe.puts(message) - else - STDOUT.puts self.format(message) - end - end - private def suppress_stdout @@ -54,12 +21,13 @@ class Demon::Sidekiq < ::Demon::Base false end - def log(message, level: :info) - self.class.log(message, level:) + def log_in_trap(message, level: :info) + SignalTrapLogger.instance.log(@logger, message, level: level) end def after_fork Demon::Sidekiq.after_fork&.call + SignalTrapLogger.instance.after_fork log("Loading Sidekiq in process id #{Process.pid}") require "sidekiq/cli" @@ -67,9 +35,9 @@ class Demon::Sidekiq < ::Demon::Base # Unicorn uses USR1 to indicate that log files have been rotated Signal.trap("USR1") do - log("Sidekiq reopening logs...") + log_in_trap("Sidekiq reopening logs...") Unicorn::Util.reopen_logs - log("Sidekiq done reopening logs...") + log_in_trap("Sidekiq done reopening logs...") end options = ["-c", GlobalSetting.sidekiq_workers.to_s] @@ -91,8 +59,7 @@ class Demon::Sidekiq < ::Demon::Base load Rails.root + "config/initializers/100-sidekiq.rb" cli.run rescue => e - STDERR.puts e.message - STDERR.puts e.backtrace.join("\n") + log("Error encountered while starting Sidekiq: #{e.message}\n#{e.backtrace.join("\n")}") exit 1 end end diff --git a/lib/freedom_patches/unicorn_http_server_patch.rb b/lib/freedom_patches/unicorn_http_server_patch.rb deleted file mode 100644 index 6edb04154c5..00000000000 --- a/lib/freedom_patches/unicorn_http_server_patch.rb +++ /dev/null @@ -1,38 +0,0 @@ -# frozen_string_literal: true - -if ENV["DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT"] && defined?(Unicorn::HttpServer) - module UnicornHTTPServerPatch - # Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496 - def murder_lazy_workers - next_sleep = @timeout - 1 - now = time_now.to_i - @workers.dup.each_pair do |wpid, worker| - tick = worker.tick - 0 == tick and next # skip workers that haven't processed any clients - diff = now - tick - tmp = @timeout - diff - - # START MONKEY PATCH - if tmp < 2 - logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \ - "(#{diff}s), sending USR2 to dump thread backtraces" - kill_worker(:USR2, wpid) - end - # END MONKEY PATCH - - if tmp >= 0 - next_sleep > tmp and next_sleep = tmp - next - end - next_sleep = 0 - logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \ - "(#{diff}s > #{@timeout}s), killing" - - kill_worker(:KILL, wpid) # take no prisoners for timeout violations - end - next_sleep <= 0 ? 1 : next_sleep - end - end - - Unicorn::HttpServer.prepend(UnicornHTTPServerPatch) -end diff --git a/lib/signal_trap_logger.rb b/lib/signal_trap_logger.rb new file mode 100644 index 00000000000..fcedd18a48a --- /dev/null +++ b/lib/signal_trap_logger.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +# This class is used to log messages to a specified logger from within a `Signal.trap` context. Most loggers rely on +# methods that are prohibited within a `Signal.trap` context, so this class is used to queue up log messages and then +# log them from a separate thread outside of the `Signal.trap` context. +# +# Example: +# Signal.trap("USR1") do +# SignalTrapLogger.instance.log(Rails.logger, "Received USR1 signal") +# end +# +# Do note that you need to call `SignalTrapLogger.instance.after_fork` after forking a new process to ensure that the +# logging thread is running in the new process. +class SignalTrapLogger + include Singleton + + def initialize + @queue = Queue.new + ensure_logging_thread_running + end + + def log(logger, message, level: :info) + @queue << { logger:, message:, level: } + end + + def after_fork + ensure_logging_thread_running + end + + private + + def ensure_logging_thread_running + return if @thread&.alive? + + @thread = + Thread.new do + loop do + begin + log_entry = @queue.pop + log_entry[:logger].public_send(log_entry[:level], log_entry[:message]) + rescue => error + Rails.logger.error( + "Error in SignalTrapLogger thread: #{error.message}\n#{error.backtrace.join("\n")}", + ) + end + end + end + end +end