druid/examples/bin/supervise

397 lines
11 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env perl
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
use strict;
use warnings;
use Cwd qw/realpath/;
use POSIX qw/:sys_wait_h mkfifo setsid/;
use Fcntl qw/:DEFAULT :flock/;
use Getopt::Long qw/:config require_order gnu_compat/;
use FindBin;
use File::Spec;
use File::Copy;
# Abort with a timestamped message.
#
# Any trailing newline on the message is dropped and exactly one is appended,
# so die() never adds its own "at FILE line N" suffix.  The ($) prototype is
# kept because callers invoke this without parentheses.
sub logdie($)
{
    my $text = shift;
    chomp $text;
    my $stamp = localtime;    # scalar assignment => human-readable timestamp
    die sprintf("[%s] %s\n", $stamp, $text);
}
# Emit a timestamped message through warn().
#
# Any trailing newline on the message is dropped and exactly one is appended,
# so warn() never adds its own "at FILE line N" suffix.  The ($) prototype is
# kept because callers invoke this without parentheses.
sub logit($)
{
    my $text = shift;
    chomp $text;
    my $stamp = localtime;    # scalar assignment => human-readable timestamp
    warn sprintf("[%s] %s\n", $stamp, $text);
}
# Print the command-line synopsis and terminate via die().
sub usage
{
    my @synopsis = (
        "-c <conf file>",
        "[-d <var dir>]",
        "[-t <kill timeout>]",
        "[--svlogd <optional conf file>]",
    );
    die join(" ", "usage:", $0, @synopsis) . "\n";
}
# Parse a supervise configuration file.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Every other line must have one of these forms:
#
#   :verify <command>               command to run once before any service starts
#   :kill-timeout <seconds>         value stored under 'kill-timeout' (via int())
#   [!p<order>] <name> <command>    a service to supervise; <order> defaults to 50
#
# A service name is a run of non-blank characters that does not start with a
# colon.
#
# Returns a hashref with the keys
#   commands       arrayref of service records (name, command, order, pid,
#                  down, killed, restarting)
#   verify         arrayref of verification command strings
#   'kill-timeout' integer, or undef when the file does not set it
#
# Dies when the file cannot be opened, on a line that matches none of the
# forms above, and on a repeated service name.
sub read_config_file
{
    my ($config_file) = @_;
    open my $config_fh, "<", $config_file
        or die "open $config_file: $!";

    my @commands;
    my @verify;
    my $kill_timeout;

    while (my $line = <$config_fh>) {
        chomp $line;
        next if $line =~ /^(\s*\#.*|\s*)$/;

        # The name part used to be "[^:]\S+", which needs at least two
        # characters: a one-letter service name was rejected as a syntax
        # error, or, behind a "!pN" prefix, the prefix was taken as the name
        # and the real name became part of the command.  The second
        # alternative accepts a single non-blank, non-colon character; every
        # line that matched before still matches the same way.
        if ($line =~ /^(:verify|:kill-timeout|(?:\!p[0-9]+\s+)?(?:[^:]\S+|[^:\s]))\s+(.+)$/) {
            # Copy the captures before the next match overwrites $1/$2.
            my $name = $1;
            my $order = 50;
            my $command = $2;

            # Split an optional "!p<order>" prefix off the name.
            if ($name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
                $order = $1;
                $name = $2;
            }

            if ($name eq ':verify') {
                push @verify, $command;
            } elsif ($name eq ':kill-timeout') {
                $kill_timeout = int($command);
            } else {
                die "Duplicate command: $line\n" if grep { $_->{name} eq $name } @commands;
                push @commands, {
                    name       => $name,
                    command    => $command,
                    order      => $order,    # Stop order for this command
                    pid        => 0,         # Current pid, or 0 if not running
                    down       => 0,         # Time the proc should be down until
                    killed     => 0,         # Signal we sent to this process
                    restarting => 0,         # True if this command is currently restarting
                };
            }
        } else {
            die "Syntax error: $line\n";
        }
    }

    close $config_fh;
    return { commands => \@commands, verify => \@verify, 'kill-timeout' => $kill_timeout };
}
# Describe a wait status (the value of $? after waitpid) as text.
#
# The low seven bits are reported as the terminating signal when nonzero,
# otherwise the value shifted right by eight is reported as the exit code;
# ", dumped core" is appended when bit 128 is set.
sub stringify_exit_status
{
    my ($status) = @_;

    my $signo = $status & 127;
    my @parts = $signo
        ? ("signal = $signo")
        : ("exited = " . ($status >> 8));

    push @parts, "dumped core" if $status & 128;

    return join(", ", @parts);
}
# Create a fresh control fifo named ".ctrl" inside $svdir and return a
# filehandle on it.
#
# An existing file of that name is removed first.  The fifo is created with
# mode 0700 and opened read-write and non-blocking.  Dies if any step fails.
sub open_control_fifo
{
    my ($svdir) = @_;
    my $path = "$svdir/.ctrl";

    # Drop whatever a previous run left behind.
    if (-e $path && !unlink($path)) {
        die "Cannot remove fifo: $path\n";
    }

    unless (mkfifo($path, 0700)) {
        die "Cannot create fifo: $path\n";
    }

    my $mode = O_NONBLOCK | O_RDWR;
    sysopen(my $handle, $path, $mode)
        or die "Cannot open fifo for reading: $path\n";

    return $handle;
}
# Wrap $text in ANSI escape sequences when STDERR is a terminal.
#
# $color 'bold' yields bold text and 'red' yields bold red text; any other
# value, or a STDERR that is not a terminal, returns $text unchanged.
sub pretty
{
    my ($text, $color) = @_;

    return $text unless -t STDERR;

    my %prefix_for = (
        bold => "\x1b[1m",
        red  => "\x1b[31m\x1b[1m",
    );
    my $prefix = $prefix_for{$color};

    return defined $prefix ? "$prefix$text\x1b[0m" : $text;
}
# ---------------------------------------------------------------------------
# Start-up sequence.  The order of the steps below matters: paths are made
# absolute before the chdir, the svdir lock is taken before the control fifo
# is (re)created, and the verification commands run before the shutdown
# signal handlers are installed.
# ---------------------------------------------------------------------------
# Service records from the configuration file (filled in below)
my @commands;
# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
my $killed = 0;
# If >0 then kill -9 all procs at this time
my $killkill = 0;
# Current proc order we're stopping. Ignored unless $killed is nonzero
my $stopping = 100;
# We'll do our own reaping
$SIG{CHLD} = sub {};
# Redirect stderr to stdout
open STDERR, ">&STDOUT" or die;
# Parse arguments
# Defaults: chdir to the parent of the directory holding this script, keep
# state in its var/ subdirectory, and allow 360 seconds for a shutdown.
my %opt = (
'chdir' => realpath("$FindBin::Bin/.."),
'vardir' => realpath("$FindBin::Bin/../var"),
'kill-timeout' => 360,
);
# 'svlogd:s' takes an optional value: the option being present at all turns
# on svlogd logging, and a non-empty value is used as the svlogd config file.
usage() unless GetOptions(
\%opt,
'conf|c=s',
'vardir|d=s',
'kill-timeout|t=i',
'chdir=s',
'svlogd:s'
);
# Both a config file and a var directory are mandatory
usage() unless $opt{'conf'} && $opt{'vardir'};
# Read config file
my $config = read_config_file($opt{'conf'});
@commands = @{$config->{commands}};
if (!@commands) {
die "Nothing to run.\n";
}
# Potentially override --kill-timeout
# (a :kill-timeout line in the config file wins over the command line)
if (defined $config->{'kill-timeout'}) {
$opt{'kill-timeout'} = $config->{'kill-timeout'};
}
# Remember where vardir, logdir, svdir are after chdiring
# (the log directory comes from $DRUID_LOG_DIR when set, else ../log next to
# this script)
my $vardir = File::Spec->rel2abs($opt{vardir});
my $logdir = File::Spec->rel2abs(realpath($ENV{'DRUID_LOG_DIR'} || "$FindBin::Bin/../log"));
my $svdir = "$vardir/sv";
# chdir to the root of the distribution (or wherever)
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";
# Create vardir with tmp/
if (! -e "$vardir/tmp") {
system("mkdir -p \Q$vardir\E/tmp") == 0 or die "mkdir $vardir/tmp failed: $!\n";
}
# Create svdir
if (! -e $svdir) {
system("mkdir -p \Q$svdir\E") == 0 or die "mkdir $svdir failed: $!\n";
}
# Create logdir, if needed
# (not needed with --svlogd, where output goes under svdir instead)
if (!defined $opt{svlogd} && ! -e "$logdir") {
system("mkdir -p \Q$logdir\E") == 0 or die "mkdir $logdir failed: $!\n";
}
# Lock svdir and keep it locked until we exit
# (non-blocking exclusive lock: fail at once if someone else holds it)
my $lockfile = "$svdir/.lock";
open my $lockfh, ">", $lockfile or die "Cannot write to svdir, please check permissions: $svdir\n";
flock($lockfh, LOCK_EX | LOCK_NB) or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";
# Create control fifo in svdir
my $fifofh = open_control_fifo($svdir);
# Run verification commands
# (any nonzero result aborts start-up with exit status 1)
for my $verify_cmd (@{$config->{verify}}) {
system($verify_cmd) == 0 or exit 1;
}
# Catch killy signals and do an orderly shutdown
# Only the first signal counts: it records its number in $killed (1 = HUP,
# 2 = INT, 15 = TERM) and sets the time after which kill -9 is used.
$SIG{HUP} = sub { if (!$killed) { $killed = 1; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{INT} = sub { if (!$killed) { $killed = 2; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{TERM} = sub { if (!$killed) { $killed = 15; $killkill = time + $opt{'kill-timeout'}; } };
# Build up control fifo command over multiple sysreads, potentially
my $fifobuffer = '';

# Announce where service output will be written.  With --svlogd each service
# logs into its own directory under $svdir; otherwise output goes to $logdir.
# (The svlogd message used to read "Staring ... [svdir]": a typo plus a
# missing sigil, so the directory was never actually shown.)
if (defined $opt{svlogd}) {
    logit "Starting services with log directory [$svdir].";
} else {
    logit "Starting services with log directory [$logdir].";
}
# Main supervision loop.  Roughly once per second it
#   1. starts every service that is not running (unless we are shutting down),
#   2. reaps exited children and schedules them for restart,
#   3. while shutting down, signals services in descending stop order and
#      switches to SIGKILL once the kill timeout has passed,
#   4. exits once a shutdown was requested and no service is left running,
#   5. processes control fifo commands: "k <name>" restarts one service,
#      "d" shuts everything down and exits with status 0.
while (1) {
    # Spawn new procs
    if (!$killed) {
        for my $command (grep { !$_->{pid} } @commands) {
            # Honour the back-off set when this service was last reaped.
            next unless $command->{down} < time;

            my $pid = fork;
            if (!defined $pid) {
                # fork failed and we are still the supervisor.  Falling
                # through to the child code here would make the supervisor
                # exec the service in its own place, so log and retry later.
                logit "WARN: Could not fork for command[" . pretty($command->{name}, 'bold') . "]: $!";
                $command->{down} = time + 2;
                next;
            }
            if ($pid) {
                # Parent: remember the child and carry on.
                $command->{pid} = $pid;
                next;
            }

            # Child: detach into its own session, set up logging, then exec.
            setsid;
            if (defined $opt{'svlogd'}) {
                # If using svlogd, program output goes into the service directory. We do not use $logdir here.
                my $logfile = "$svdir/$command->{name}";
                logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";
                if (! -e $logfile) {
                    system("mkdir -p \Q$logfile\E") == 0 or logdie "mkdir $logfile failed: $!\n";
                }
                if ($opt{'svlogd'}) {
                    # A svlogd config file was named on the command line
                    copy($opt{'svlogd'}, "$logfile/config") or logdie "Failed copying $opt{'svlogd'} to $logfile/config: $!";
                } else {
                    # No file given: write a built-in svlogd config
                    open my $configfh, ">", "$logfile/config" or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
                    print $configfh "s100000000\nn10\nN5\nt604800";
                    close $configfh;
                }
                # List-form pipe open: the directory is handed to svlogd as a
                # single argument, so blanks or shell metacharacters in the
                # path cannot split or alter the command (the mkdir above
                # already quotes the same path).
                open STDOUT, "|-", "svlogd", $logfile or logdie "pipe to svlogd $logfile failed: $!\n";
            } else {
                # If not using svlogd, program output goes to $logdir. In the default configuration, this will be a small
                # amount of logging from the JVM itself, because all of the Druid and ZooKeeper logs are written into
                # separate files by log4j2.
                logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";
                my $logfile = "$logdir/$command->{name}.stdout.log";
                open STDOUT, ">>", $logfile or logdie "open $logfile failed: $!\n";
            }
            open STDERR, ">&STDOUT" or logdie "redirecting stderr failed: $!\n";
            # "exec" inside the shell so the service replaces the sh process
            exec('sh', '-c', "exec $command->{command}") or logdie "exec [$command->{command}] failed: $!";
        }
    }

    # Reap dead procs
    my $pid;
    while (($pid = waitpid(-1, WNOHANG)) > 0) {
        my $status = $?;
        my ($command) = (grep { $_->{pid} == $pid } @commands);
        if ($command) {
            $command->{pid} = 0;
            # Keep the service down for two seconds before it is respawned
            $command->{down} = time + 2;
            logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";
            if ($status && !$killed && !$command->{restarting}) {
                # Unexpected exit
                logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see its logfile for more details";
            }
            $command->{restarting} = 0;
        } else {
            logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
        }
    }

    # Kill procs, maybe
    if ($killed) {
        my $should_killkill = time > $killkill;

        # Update stopping position, maybe
        if ($should_killkill) {
            # Timeout passed: every remaining process becomes eligible
            $stopping = 0;
        } else {
            # Only lower the position to the highest order still running, so
            # higher-order services stop before lower-order ones are signalled
            my $maxorder = 0;
            for my $command (grep { $_->{pid} } @commands) {
                if ($command->{order} > $maxorder) {
                    $maxorder = $command->{order};
                }
            }
            if ($maxorder < $stopping) {
                $stopping = $maxorder;
            }
        }

        for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
            my $want_signal;
            if ($command->{killed} == 9 || $should_killkill) {
                $want_signal = 9;
            } else {
                $want_signal = 15;
            }

            # Send each signal at most once per process
            if ($command->{killed} != $want_signal) {
                if ($want_signal != 9) {
                    my $kt = $opt{'kill-timeout'};
                    logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
                } else {
                    logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
                }
                kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
                $command->{killed} = $want_signal;
            }
        }
    }

    # Kill ourselves, maybe
    if ($killed && ! grep { $_->{pid} } @commands) {
        logit "Exiting.";
        $SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
        if ($killed > 0) {
            # Re-raise the signal that asked us to stop, now that the default
            # handlers are back; exit 1 is the fallback if we survive it
            kill $killed, $$;
            exit 1;
        } else {
            # Normal exit
            exit 0;
        }
    }

    # Be controlled, maybe
    my $fifostr = "";
    if (sysread $fifofh, $fifostr, 4096) {
        $fifobuffer .= $fifostr;
        # Handle every complete line; a trailing partial line stays buffered
        while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
            my $fifocmd = $1;
            $fifobuffer = $2;
            if ($fifocmd =~ /^k (.+)$/ && !$killed) {
                my $name = $1;
                my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
                if ($command) {
                    logit "Restarting command[" . pretty($name, "bold") . "].";
                    if (kill TERM => $command->{pid}) {
                        # Marks the coming exit as expected, so the reaper
                        # does not report it as a failure
                        $command->{restarting} = 1;
                    } else {
                        logit "WARN: Could not signal pid: $command->{pid}";
                    }
                } else {
                    logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
                }
            } elsif ($fifocmd eq 'd') {
                # Like the signal handlers, only the first shutdown request
                # counts: a later "d" must neither replace the signal we will
                # re-raise on exit nor push back the kill -9 deadline.
                if (!$killed) {
                    # -1 means exit without signal
                    $killed = -1;
                    $killkill = time + $opt{'kill-timeout'};
                }
            } else {
                logit "Received unknown control command, ignoring.";
            }
        }
    }

    sleep 1;
}
exit 0;