2018-08-09 16:37:52 -04:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
|
|
# or more contributor license agreements. See the NOTICE file
|
|
|
|
# distributed with this work for additional information
|
|
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
|
|
# to you under the Apache License, Version 2.0 (the
|
|
|
|
# "License"); you may not use this file except in compliance
|
|
|
|
# with the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing,
|
|
|
|
# software distributed under the License is distributed on an
|
|
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
# KIND, either express or implied. See the License for the
|
|
|
|
# specific language governing permissions and limitations
|
|
|
|
# under the License.
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings;
|
|
|
|
use Cwd qw/realpath/;
|
|
|
|
use POSIX qw/:sys_wait_h mkfifo setsid/;
|
|
|
|
use Fcntl qw/:DEFAULT :flock/;
|
|
|
|
use Getopt::Long qw/:config require_order gnu_compat/;
|
|
|
|
use FindBin;
|
|
|
|
use File::Spec;
|
|
|
|
use File::Copy;
|
|
|
|
|
|
|
|
sub logdie($)
|
|
|
|
{
|
|
|
|
my ($msg) = @_;
|
|
|
|
chomp $msg;
|
|
|
|
die "[" . (scalar localtime()) . "] $msg\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
sub logit($)
|
|
|
|
{
|
|
|
|
my ($msg) = @_;
|
|
|
|
chomp $msg;
|
|
|
|
warn "[" . (scalar localtime()) . "] $msg\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
sub usage
|
|
|
|
{
|
|
|
|
die "usage: $0 -c <conf file> [-d <var dir>] [-t <kill timeout>] [--svlogd <optional conf file>]\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
sub read_config_file
|
|
|
|
{
|
|
|
|
my ($config_file) = @_;
|
|
|
|
|
|
|
|
open my $config_fh, "<", $config_file
|
|
|
|
or die "open $config_file: $!";
|
|
|
|
|
|
|
|
my @commands;
|
|
|
|
my @verify;
|
|
|
|
my $kill_timeout;
|
|
|
|
while (my $line = <$config_fh>) {
|
|
|
|
chomp $line;
|
|
|
|
next if $line =~ /^(\s*\#.*|\s*)$/;
|
|
|
|
|
|
|
|
if ($line =~ /^(:verify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:]\S+)\s+(.+)$/) {
|
|
|
|
my $name = $1;
|
|
|
|
my $order = 50;
|
|
|
|
my $command = $2;
|
|
|
|
|
|
|
|
if ($name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
|
|
|
|
$order = $1;
|
|
|
|
$name = $2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($name eq ':verify') {
|
|
|
|
push @verify, $command;
|
|
|
|
} elsif ($name eq ':kill-timeout') {
|
|
|
|
$kill_timeout = int($command);
|
|
|
|
} else {
|
|
|
|
die "Duplicate command: $line\n" if grep { $_->{name} eq $name } @commands;
|
|
|
|
push @commands, {
|
|
|
|
name => $name,
|
|
|
|
command => $command,
|
|
|
|
order => $order, # Stop order for this command
|
|
|
|
pid => 0, # Current pid, or 0 if not running
|
|
|
|
down => 0, # Time the proc should be down until
|
|
|
|
killed => 0, # Signal we sent to this process
|
|
|
|
restarting => 0, # True if this command is currently restarting
|
|
|
|
};
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
die "Syntax error: $line\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
close $config_fh;
|
|
|
|
return { commands => \@commands, verify => \@verify, 'kill-timeout' => $kill_timeout };
|
|
|
|
}
|
|
|
|
|
|
|
|
sub stringify_exit_status
|
|
|
|
{
|
|
|
|
my ($status) = @_;
|
|
|
|
my $string;
|
|
|
|
my $signal = $status & 127;
|
|
|
|
my $cored = $status & 128;
|
|
|
|
my $code = $status >> 8;
|
|
|
|
|
|
|
|
if ($signal) {
|
|
|
|
$string = "signal = $signal";
|
|
|
|
} else {
|
|
|
|
$string = "exited = $code";
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($cored) {
|
|
|
|
$string = $string . ", dumped core";
|
|
|
|
}
|
|
|
|
|
|
|
|
return $string;
|
|
|
|
}
|
|
|
|
|
|
|
|
sub open_control_fifo
|
|
|
|
{
|
|
|
|
my ($svdir) = @_;
|
|
|
|
my $fifofile = "$svdir/.ctrl";
|
|
|
|
if (-e $fifofile) {
|
|
|
|
unlink $fifofile or die "Cannot remove fifo: $fifofile\n";
|
|
|
|
}
|
|
|
|
mkfifo($fifofile, 0700) or die "Cannot create fifo: $fifofile\n";
|
|
|
|
sysopen my $fifofh, $fifofile, O_NONBLOCK | O_RDWR or die "Cannot open fifo for reading: $fifofile\n";
|
|
|
|
return $fifofh;
|
|
|
|
}
|
|
|
|
|
|
|
|
sub pretty
|
|
|
|
{
|
|
|
|
my ($text, $color) = @_;
|
|
|
|
if (-t STDERR) {
|
|
|
|
if ($color eq 'bold') {
|
|
|
|
return "\x1b[1m$text\x1b[0m";
|
|
|
|
} elsif ($color eq 'red') {
|
|
|
|
return "\x1b[31m\x1b[1m$text\x1b[0m";
|
|
|
|
} else {
|
|
|
|
return $text;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return $text;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
my @commands;
|
|
|
|
|
|
|
|
# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
|
|
|
|
my $killed = 0;
|
|
|
|
|
|
|
|
# If >0 then kill -9 all procs at this time
|
|
|
|
my $killkill = 0;
|
|
|
|
|
|
|
|
# Current proc order we're stopping. Ignored unless $killed is nonzero
|
|
|
|
my $stopping = 100;
|
|
|
|
|
|
|
|
# We'll do our own reaping
|
|
|
|
$SIG{CHLD} = sub {};
|
|
|
|
|
|
|
|
# Redirect stderr to stdout
|
|
|
|
open STDERR, ">&STDOUT" or die;
|
|
|
|
|
|
|
|
# Parse arguments
|
|
|
|
my %opt = (
|
|
|
|
'chdir' => realpath("$FindBin::Bin/.."),
|
|
|
|
'vardir' => realpath("$FindBin::Bin/../var"),
|
|
|
|
'kill-timeout' => 360,
|
|
|
|
);
|
|
|
|
|
|
|
|
usage() unless GetOptions(
|
|
|
|
\%opt,
|
|
|
|
'conf|c=s',
|
|
|
|
'vardir|d=s',
|
|
|
|
'kill-timeout|t=i',
|
|
|
|
'chdir=s',
|
|
|
|
'svlogd:s'
|
|
|
|
);
|
|
|
|
|
|
|
|
usage() unless $opt{'conf'} && $opt{'vardir'};
|
|
|
|
|
|
|
|
# Read config file
|
|
|
|
my $config = read_config_file($opt{'conf'});
|
|
|
|
@commands = @{$config->{commands}};
|
|
|
|
|
|
|
|
if (!@commands) {
|
|
|
|
die "Nothing to run.\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
# Potentially override --kill-timeout
|
|
|
|
if (defined $config->{'kill-timeout'}) {
|
|
|
|
$opt{'kill-timeout'} = $config->{'kill-timeout'};
|
|
|
|
}
|
|
|
|
|
2022-06-03 01:14:29 -04:00
|
|
|
# Remember where vardir, logdir, svdir are after chdiring
|
2018-08-09 16:37:52 -04:00
|
|
|
my $vardir = File::Spec->rel2abs($opt{vardir});
|
2022-06-03 01:14:29 -04:00
|
|
|
my $logdir = File::Spec->rel2abs(realpath($ENV{'DRUID_LOG_DIR'} || "$FindBin::Bin/../log"));
|
2018-08-09 16:37:52 -04:00
|
|
|
my $svdir = "$vardir/sv";
|
|
|
|
|
|
|
|
# chdir to the root of the distribution (or whereever)
|
|
|
|
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";
|
|
|
|
|
|
|
|
# Create vardir with tmp/
|
|
|
|
if (! -e "$vardir/tmp") {
|
|
|
|
system("mkdir -p \Q$vardir\E/tmp") == 0 or die "mkdir $vardir/tmp failed: $!\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
# Create svdir
|
|
|
|
if (! -e $svdir) {
|
|
|
|
system("mkdir -p \Q$svdir\E") == 0 or die "mkdir $svdir failed: $!\n";
|
|
|
|
}
|
|
|
|
|
2022-06-03 01:14:29 -04:00
|
|
|
# Create logdir, if needed
|
|
|
|
if (!defined $opt{svlogd} && ! -e "$logdir") {
|
|
|
|
system("mkdir -p \Q$logdir\E") == 0 or die "mkdir $logdir failed: $!\n";
|
|
|
|
}
|
|
|
|
|
2018-08-09 16:37:52 -04:00
|
|
|
# Lock svdir and keep it locked until we exit
|
|
|
|
my $lockfile = "$svdir/.lock";
|
|
|
|
open my $lockfh, ">", $lockfile or die "Cannot write to svdir, please check permissions: $svdir\n";
|
|
|
|
flock($lockfh, LOCK_EX | LOCK_NB) or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";
|
|
|
|
|
|
|
|
# Create control fifo in svdir
|
|
|
|
my $fifofh = open_control_fifo($svdir);
|
|
|
|
|
|
|
|
# Run verification commands
|
|
|
|
for my $verify_cmd (@{$config->{verify}}) {
|
|
|
|
system($verify_cmd) == 0 or exit 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Catch killy signals and do an orderly shutdown
|
|
|
|
$SIG{HUP} = sub { if (!$killed) { $killed = 1; $killkill = time + $opt{'kill-timeout'}; } };
|
|
|
|
$SIG{INT} = sub { if (!$killed) { $killed = 2; $killkill = time + $opt{'kill-timeout'}; } };
|
|
|
|
$SIG{TERM} = sub { if (!$killed) { $killed = 15; $killkill = time + $opt{'kill-timeout'}; } };
|
|
|
|
|
|
|
|
# Build up control fifo command over multiple sysreads, potentially
|
|
|
|
my $fifobuffer = '';
|
|
|
|
|
2022-06-03 01:14:29 -04:00
|
|
|
if (defined $opt{svlogd}) {
|
|
|
|
logit "Staring services with log directory [svdir].";
|
|
|
|
} else {
|
|
|
|
logit "Starting services with log directory [$logdir].";
|
|
|
|
}
|
|
|
|
|
2018-08-09 16:37:52 -04:00
|
|
|
while (1) {
|
|
|
|
# Spawn new procs
|
|
|
|
if (!$killed) {
|
|
|
|
for my $command (grep { !$_->{pid} } @commands) {
|
|
|
|
if ($command->{down} < time) {
|
|
|
|
if (my $pid = fork) {
|
|
|
|
$command->{pid} = $pid;
|
|
|
|
} else {
|
|
|
|
setsid;
|
|
|
|
|
|
|
|
if (defined $opt{'svlogd'}) {
|
2022-06-03 01:14:29 -04:00
|
|
|
# If using svlogd, program output goes into the service directory. We do not use $logdir here.
|
|
|
|
my $logfile = "$svdir/$command->{name}";
|
|
|
|
logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";
|
|
|
|
|
2018-08-09 16:37:52 -04:00
|
|
|
if (! -e $logfile) {
|
|
|
|
system("mkdir -p \Q$logfile\E") == 0 or logdie "mkdir $logfile failed: $!\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($opt{'svlogd'}) {
|
|
|
|
copy($opt{'svlogd'}, "$logfile/config") or logdie "Failed copying $opt{'svlogd'} to $logfile/config: $!";
|
|
|
|
} else {
|
|
|
|
open my $configfh, ">", "$logfile/config" or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
|
|
|
|
print $configfh "s100000000\nn10\nN5\nt604800";
|
|
|
|
close $configfh;
|
|
|
|
}
|
|
|
|
|
|
|
|
open STDOUT, "|svlogd $logfile" or logdie "pipe to svlogd $logfile failed: $!\n";
|
|
|
|
} else {
|
2022-06-03 01:14:29 -04:00
|
|
|
# If not using svlogd, program output goes to $logdir. In the default configuration, this will be a small
|
|
|
|
# amount of logging from the JVM itself, because all of the Druid and ZooKeeper logs are written into
|
|
|
|
# separate files by log4j2.
|
|
|
|
logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";
|
|
|
|
my $logfile = "$logdir/$command->{name}.stdout.log";
|
|
|
|
open STDOUT, ">>", $logfile or logdie "open $logfile failed: $!\n";
|
2018-08-09 16:37:52 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
open STDERR, ">&STDOUT" or logdie "redirecting stderr failed: $!\n";
|
|
|
|
exec('sh', '-c', "exec $command->{command}") or logdie "exec [$command->{command}] failed: $!";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Reap dead procs
|
|
|
|
my $pid;
|
|
|
|
while (($pid = waitpid(-1, WNOHANG)) > 0) {
|
|
|
|
my $status = $?;
|
|
|
|
my ($command) = (grep { $_->{pid} eq $pid } @commands);
|
|
|
|
if ($command) {
|
|
|
|
$command->{pid} = 0;
|
|
|
|
$command->{down} = time + 2;
|
|
|
|
logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";
|
|
|
|
if ($status && !$killed && !$command->{restarting}) {
|
|
|
|
# Unexpected exit
|
2021-12-03 08:32:01 -05:00
|
|
|
logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see its logfile for more details";
|
2018-08-09 16:37:52 -04:00
|
|
|
}
|
|
|
|
$command->{restarting} = 0;
|
|
|
|
} else {
|
|
|
|
logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Kill procs, maybe
|
|
|
|
if ($killed) {
|
|
|
|
my $should_killkill = time > $killkill;
|
|
|
|
|
|
|
|
# Update stopping position, maybe
|
|
|
|
if ($should_killkill) {
|
|
|
|
$stopping = 0;
|
|
|
|
} else {
|
|
|
|
my $maxorder = 0;
|
|
|
|
for my $command (grep { $_->{pid} } @commands) {
|
|
|
|
if ($command->{order} > $maxorder) {
|
|
|
|
$maxorder = $command->{order};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($maxorder < $stopping) {
|
|
|
|
$stopping = $maxorder;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
|
|
|
|
my $want_signal;
|
|
|
|
if ($command->{killed} == 9 || $should_killkill) {
|
|
|
|
$want_signal = 9;
|
|
|
|
} else {
|
|
|
|
$want_signal = 15;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ($command->{killed} != $want_signal) {
|
|
|
|
if ($want_signal != 9) {
|
|
|
|
my $kt = $opt{'kill-timeout'};
|
|
|
|
logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
|
|
|
|
} else {
|
|
|
|
logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
|
|
|
|
}
|
|
|
|
kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
|
|
|
|
$command->{killed} = $want_signal;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Kill ourselves, maybe
|
|
|
|
if ($killed && ! grep { $_->{pid} } @commands) {
|
|
|
|
logit "Exiting.";
|
|
|
|
$SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
|
|
|
|
if ($killed > 0) {
|
|
|
|
kill $killed, $$;
|
|
|
|
exit 1;
|
|
|
|
} else {
|
|
|
|
# Normal exit
|
|
|
|
exit 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Be controlled, maybe
|
|
|
|
my $fifostr = "";
|
|
|
|
if (sysread $fifofh, $fifostr, 4096) {
|
|
|
|
$fifobuffer .= $fifostr;
|
|
|
|
|
|
|
|
while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
|
|
|
|
my $fifocmd = $1;
|
|
|
|
$fifobuffer = $2;
|
|
|
|
if ($fifocmd =~ /^k (.+)$/ && !$killed) {
|
|
|
|
my $name = $1;
|
|
|
|
my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
|
|
|
|
if ($command) {
|
|
|
|
logit "Restarting command[" . pretty($name, "bold") . "].";
|
|
|
|
if (kill TERM => $command->{pid}) {
|
|
|
|
$command->{restarting} = 1;
|
|
|
|
} else {
|
|
|
|
logit "WARN: Could not signal pid: $command->{pid}"
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
|
|
|
|
}
|
|
|
|
} elsif ($fifocmd eq 'd') {
|
|
|
|
# -1 means exit without signal
|
|
|
|
$killed = -1;
|
|
|
|
$killkill = time + $opt{'kill-timeout'}
|
|
|
|
} else {
|
|
|
|
logit "Received unknown control command, ignoring.";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
sleep 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
exit 0;
|