DEV: Log Unicorn worker timeout backtraces to `Rails.logger` (#27257)
This commit introduces the following changes: 1. Introduce the `SignalTrapLogger` singleton which starts a single thread that polls a queue to log messages with the specified logger. This thread is necessary becasue most loggers cannot be used inside the `Signal.trap` context as they rely on mutexes which are not allowed within the context. 2. Moves the monkey patch in `freedom_patches/unicorn_http_server_patch.rb` to `config/unicorn.config.rb` which is already monkey patching `Unicorn::HttpServer`. 3. `Unicorn::HttpServer` will now automatically send a `USR2` signal to a unicorn worker 2 seconds before the worker is timed out by the Unicorn master. 4. When a Unicorn worker receives a `USR2` signal, it will now log only the main thread's backtraces to `Rails.logger`. Previously, it was `put`ing the backtraces to `STDOUT` which most people wouldn't read. Logging it via `Rails.logger` will make the backtraces easily accessible via `/logs`.
This commit is contained in:
parent
4b2bd4d682
commit
23c38cbf11
|
@ -84,14 +84,8 @@ before_fork do |server, worker|
|
|||
server.logger.info "starting #{sidekiqs} supervised sidekiqs"
|
||||
|
||||
require "demon/sidekiq"
|
||||
Demon::Sidekiq.logger = server.logger
|
||||
Demon::Sidekiq.after_fork { DiscourseEvent.trigger(:sidekiq_fork_started) }
|
||||
Demon::Sidekiq.start(sidekiqs)
|
||||
|
||||
Signal.trap("SIGTSTP") do
|
||||
Demon::Sidekiq.log("Issuing stop to Sidekiq")
|
||||
Demon::Sidekiq.stop
|
||||
end
|
||||
Demon::Sidekiq.start(sidekiqs, logger: server.logger)
|
||||
|
||||
# Trap USR1, so we can re-issue to sidekiq workers
|
||||
# but chain the default unicorn implementation as well
|
||||
|
@ -104,21 +98,48 @@ before_fork do |server, worker|
|
|||
|
||||
if ENV["DISCOURSE_ENABLE_EMAIL_SYNC_DEMON"] == "true"
|
||||
server.logger.info "starting up EmailSync demon"
|
||||
Demon::EmailSync.start
|
||||
Signal.trap("SIGTSTP") do
|
||||
STDERR.puts "#{Time.now}: Issuing stop to EmailSync"
|
||||
Demon::EmailSync.stop
|
||||
end
|
||||
Demon::EmailSync.start(1, logger: server.logger)
|
||||
end
|
||||
|
||||
DiscoursePluginRegistry.demon_processes.each do |demon_class|
|
||||
server.logger.info "starting #{demon_class.prefix} demon"
|
||||
demon_class.start
|
||||
demon_class.start(1, logger: server.logger)
|
||||
end
|
||||
|
||||
class ::Unicorn::HttpServer
|
||||
alias master_sleep_orig master_sleep
|
||||
|
||||
# Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
|
||||
def murder_lazy_workers
|
||||
next_sleep = @timeout - 1
|
||||
now = time_now.to_i
|
||||
@workers.dup.each_pair do |wpid, worker|
|
||||
tick = worker.tick
|
||||
0 == tick and next # skip workers that haven't processed any clients
|
||||
diff = now - tick
|
||||
tmp = @timeout - diff
|
||||
|
||||
# START MONKEY PATCH
|
||||
if tmp < 2
|
||||
logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \
|
||||
"(#{diff}s), sending USR2 to dump thread backtraces"
|
||||
kill_worker(:USR2, wpid)
|
||||
end
|
||||
# END MONKEY PATCH
|
||||
|
||||
if tmp >= 0
|
||||
next_sleep > tmp and next_sleep = tmp
|
||||
next
|
||||
end
|
||||
next_sleep = 0
|
||||
logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
|
||||
"(#{diff}s > #{@timeout}s), killing"
|
||||
|
||||
kill_worker(:KILL, wpid) # take no prisoners for timeout violations
|
||||
end
|
||||
next_sleep <= 0 ? 1 : next_sleep
|
||||
end
|
||||
|
||||
def max_sidekiq_rss
|
||||
rss =
|
||||
`ps -eo rss,args | grep sidekiq | grep -v grep | awk '{print $1}'`.split("\n")
|
||||
|
@ -270,9 +291,14 @@ end
|
|||
after_fork do |server, worker|
|
||||
DiscourseEvent.trigger(:web_fork_started)
|
||||
Discourse.after_fork
|
||||
SignalTrapLogger.instance.after_fork
|
||||
|
||||
Signal.trap("USR2") { puts <<~MSG }
|
||||
[#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] Received USR2 signal, dumping backtrace for all threads
|
||||
#{Thread.list.map { |t| "#{t.backtrace&.join("\n")}" }.join("\n\n")}
|
||||
Signal.trap("USR2") do
|
||||
message = <<~MSG
|
||||
Unicorn worker received USR2 signal indicating it is about to timeout, dumping backtrace for main thread
|
||||
#{Thread.current.backtrace&.join("\n")}
|
||||
MSG
|
||||
|
||||
SignalTrapLogger.instance.log(Rails.logger, message, level: :warn)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -9,9 +9,9 @@ class Demon::Base
|
|||
@demons
|
||||
end
|
||||
|
||||
def self.start(count = 1, verbose: false)
|
||||
def self.start(count = 1, verbose: false, logger: nil)
|
||||
@demons ||= {}
|
||||
count.times { |i| (@demons["#{prefix}_#{i}"] ||= new(i, verbose: verbose)).start }
|
||||
count.times { |i| (@demons["#{prefix}_#{i}"] ||= new(i, verbose:, logger:)).start }
|
||||
end
|
||||
|
||||
def self.stop
|
||||
|
@ -39,7 +39,7 @@ class Demon::Base
|
|||
attr_reader :pid, :parent_pid, :started, :index
|
||||
attr_accessor :stop_timeout
|
||||
|
||||
def initialize(index, rails_root: nil, parent_pid: nil, verbose: false)
|
||||
def initialize(index, rails_root: nil, parent_pid: nil, verbose: false, logger: nil)
|
||||
@index = index
|
||||
@pid = nil
|
||||
@parent_pid = parent_pid || Process.pid
|
||||
|
@ -47,6 +47,11 @@ class Demon::Base
|
|||
@stop_timeout = 10
|
||||
@rails_root = rails_root || Rails.root
|
||||
@verbose = verbose
|
||||
@logger = logger || Logger.new(STDERR)
|
||||
end
|
||||
|
||||
def log(message, level: :info)
|
||||
@logger.public_send(level, message)
|
||||
end
|
||||
|
||||
def pid_file
|
||||
|
@ -72,6 +77,7 @@ class Demon::Base
|
|||
|
||||
def stop
|
||||
@started = false
|
||||
|
||||
if @pid
|
||||
Process.kill(stop_signal, @pid)
|
||||
|
||||
|
@ -99,7 +105,7 @@ class Demon::Base
|
|||
wait_for_stop.call
|
||||
|
||||
if alive?
|
||||
STDERR.puts "Process would not terminate cleanly, force quitting. pid: #{@pid} #{self.class}"
|
||||
log("Process would not terminate cleanly, force quitting. pid: #{@pid} #{self.class}")
|
||||
Process.kill("KILL", @pid)
|
||||
end
|
||||
|
||||
|
@ -125,8 +131,9 @@ class Demon::Base
|
|||
rescue StandardError
|
||||
-1
|
||||
end
|
||||
|
||||
if dead
|
||||
STDERR.puts "Detected dead worker #{@pid}, restarting..."
|
||||
log("Detected dead worker #{@pid}, restarting...")
|
||||
@pid = nil
|
||||
@started = false
|
||||
start
|
||||
|
@ -138,7 +145,7 @@ class Demon::Base
|
|||
|
||||
if existing = already_running?
|
||||
# should not happen ... so kill violently
|
||||
STDERR.puts "Attempting to kill pid #{existing}"
|
||||
log("Attempting to kill pid #{existing}")
|
||||
Process.kill("TERM", existing)
|
||||
end
|
||||
|
||||
|
@ -199,7 +206,7 @@ class Demon::Base
|
|||
Process.kill "KILL", Process.pid
|
||||
end
|
||||
rescue => e
|
||||
STDERR.puts "URGENT monitoring thread had an exception #{e}"
|
||||
log("URGENT monitoring thread had an exception #{e}")
|
||||
end
|
||||
sleep 1
|
||||
end
|
||||
|
|
|
@ -77,14 +77,14 @@ class Demon::EmailSync < ::Demon::Base
|
|||
end
|
||||
|
||||
def after_fork
|
||||
puts "[EmailSync] Loading EmailSync in process id #{Process.pid}"
|
||||
log("[EmailSync] Loading EmailSync in process id #{Process.pid}")
|
||||
|
||||
loop do
|
||||
break if Discourse.redis.set(HEARTBEAT_KEY, Time.now.to_i, ex: HEARTBEAT_INTERVAL, nx: true)
|
||||
sleep HEARTBEAT_INTERVAL
|
||||
end
|
||||
|
||||
puts "[EmailSync] Starting EmailSync main thread"
|
||||
log("[EmailSync] Starting EmailSync main thread")
|
||||
|
||||
@running = true
|
||||
@sync_data = {}
|
||||
|
@ -158,18 +158,20 @@ class Demon::EmailSync < ::Demon::Base
|
|||
Discourse.redis.del(HEARTBEAT_KEY)
|
||||
exit 0
|
||||
rescue => e
|
||||
STDERR.puts e.message
|
||||
STDERR.puts e.backtrace.join("\n")
|
||||
log("#{e.message}: #{e.backtrace.join("\n")}")
|
||||
exit 1
|
||||
end
|
||||
|
||||
def kill_and_disconnect!(data)
|
||||
data[:thread].kill
|
||||
data[:thread].join
|
||||
|
||||
begin
|
||||
data[:syncer]&.disconnect!
|
||||
rescue Net::IMAP::ResponseError => err
|
||||
puts "[EmailSync] Encountered a response error when disconnecting: #{err}"
|
||||
log(
|
||||
"[EmailSync] Encountered a response error when disconnecting: #{err}\n#{err.backtrace.join("\n")}",
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -3,8 +3,6 @@
|
|||
require "demon/base"
|
||||
|
||||
class Demon::Sidekiq < ::Demon::Base
|
||||
cattr_accessor :logger
|
||||
|
||||
def self.prefix
|
||||
"sidekiq"
|
||||
end
|
||||
|
@ -13,37 +11,6 @@ class Demon::Sidekiq < ::Demon::Base
|
|||
blk ? (@blk = blk) : @blk
|
||||
end
|
||||
|
||||
def self.format(message)
|
||||
"[#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] #{message}"
|
||||
end
|
||||
|
||||
def self.log(message, level: :info)
|
||||
# We use an IO pipe and log messages using the logger in a seperate thread to avoid the `log writing failed. can't be called from trap context`
|
||||
# error message that is raised when trying to log from within a `Signal.trap` block.
|
||||
if logger
|
||||
if !defined?(@logger_read_pipe)
|
||||
@logger_read_pipe, @logger_write_pipe = IO.pipe
|
||||
|
||||
@logger_thread =
|
||||
Thread.new do
|
||||
begin
|
||||
while readable_io = IO.select([@logger_read_pipe])
|
||||
logger.public_send(level, readable_io.first[0].gets.strip)
|
||||
end
|
||||
rescue => e
|
||||
STDOUT.puts self.format(
|
||||
"Error in Sidekiq demon logger thread: #{e.message}\n#{e.backtrace.join("\n")}",
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@logger_write_pipe.puts(message)
|
||||
else
|
||||
STDOUT.puts self.format(message)
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def suppress_stdout
|
||||
|
@ -54,12 +21,13 @@ class Demon::Sidekiq < ::Demon::Base
|
|||
false
|
||||
end
|
||||
|
||||
def log(message, level: :info)
|
||||
self.class.log(message, level:)
|
||||
def log_in_trap(message, level: :info)
|
||||
SignalTrapLogger.instance.log(@logger, message, level: level)
|
||||
end
|
||||
|
||||
def after_fork
|
||||
Demon::Sidekiq.after_fork&.call
|
||||
SignalTrapLogger.instance.after_fork
|
||||
|
||||
log("Loading Sidekiq in process id #{Process.pid}")
|
||||
require "sidekiq/cli"
|
||||
|
@ -67,9 +35,9 @@ class Demon::Sidekiq < ::Demon::Base
|
|||
|
||||
# Unicorn uses USR1 to indicate that log files have been rotated
|
||||
Signal.trap("USR1") do
|
||||
log("Sidekiq reopening logs...")
|
||||
log_in_trap("Sidekiq reopening logs...")
|
||||
Unicorn::Util.reopen_logs
|
||||
log("Sidekiq done reopening logs...")
|
||||
log_in_trap("Sidekiq done reopening logs...")
|
||||
end
|
||||
|
||||
options = ["-c", GlobalSetting.sidekiq_workers.to_s]
|
||||
|
@ -91,8 +59,7 @@ class Demon::Sidekiq < ::Demon::Base
|
|||
load Rails.root + "config/initializers/100-sidekiq.rb"
|
||||
cli.run
|
||||
rescue => e
|
||||
STDERR.puts e.message
|
||||
STDERR.puts e.backtrace.join("\n")
|
||||
log("Error encountered while starting Sidekiq: #{e.message}\n#{e.backtrace.join("\n")}")
|
||||
exit 1
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
if ENV["DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT"] && defined?(Unicorn::HttpServer)
|
||||
module UnicornHTTPServerPatch
|
||||
# Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
|
||||
def murder_lazy_workers
|
||||
next_sleep = @timeout - 1
|
||||
now = time_now.to_i
|
||||
@workers.dup.each_pair do |wpid, worker|
|
||||
tick = worker.tick
|
||||
0 == tick and next # skip workers that haven't processed any clients
|
||||
diff = now - tick
|
||||
tmp = @timeout - diff
|
||||
|
||||
# START MONKEY PATCH
|
||||
if tmp < 2
|
||||
logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \
|
||||
"(#{diff}s), sending USR2 to dump thread backtraces"
|
||||
kill_worker(:USR2, wpid)
|
||||
end
|
||||
# END MONKEY PATCH
|
||||
|
||||
if tmp >= 0
|
||||
next_sleep > tmp and next_sleep = tmp
|
||||
next
|
||||
end
|
||||
next_sleep = 0
|
||||
logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
|
||||
"(#{diff}s > #{@timeout}s), killing"
|
||||
|
||||
kill_worker(:KILL, wpid) # take no prisoners for timeout violations
|
||||
end
|
||||
next_sleep <= 0 ? 1 : next_sleep
|
||||
end
|
||||
end
|
||||
|
||||
Unicorn::HttpServer.prepend(UnicornHTTPServerPatch)
|
||||
end
|
|
@ -0,0 +1,49 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
# This class is used to log messages to a specified logger from within a `Signal.trap` context. Most loggers rely on
|
||||
# methods that are prohibited within a `Signal.trap` context, so this class is used to queue up log messages and then
|
||||
# log them from a separate thread outside of the `Signal.trap` context.
|
||||
#
|
||||
# Example:
|
||||
# Signal.trap("USR1") do
|
||||
# SignalTrapLogger.instance.log(Rails.logger, "Received USR1 signal")
|
||||
# end
|
||||
#
|
||||
# Do note that you need to call `SignalTrapLogger.instance.after_fork` after forking a new process to ensure that the
|
||||
# logging thread is running in the new process.
|
||||
class SignalTrapLogger
|
||||
include Singleton
|
||||
|
||||
def initialize
|
||||
@queue = Queue.new
|
||||
ensure_logging_thread_running
|
||||
end
|
||||
|
||||
def log(logger, message, level: :info)
|
||||
@queue << { logger:, message:, level: }
|
||||
end
|
||||
|
||||
def after_fork
|
||||
ensure_logging_thread_running
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def ensure_logging_thread_running
|
||||
return if @thread&.alive?
|
||||
|
||||
@thread =
|
||||
Thread.new do
|
||||
loop do
|
||||
begin
|
||||
log_entry = @queue.pop
|
||||
log_entry[:logger].public_send(log_entry[:level], log_entry[:message])
|
||||
rescue => error
|
||||
Rails.logger.error(
|
||||
"Error in SignalTrapLogger thread: #{error.message}\n#{error.backtrace.join("\n")}",
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue