#!/usr/bin/env perl

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Simple process supervisor.
#
# Reads a config file listing named commands, starts each of them in its own
# session, restarts any that exit, and shuts them all down in order when it
# receives HUP/INT/TERM or a "d" command on the control fifo.
#
# Config file format (one entry per line; blank lines and "#" comments are
# skipped):
#
#   [!pN] NAME COMMAND      a command to supervise; N is its stop order
#                           (default 50, higher orders are stopped first).
#                           NAME may not begin with ":".
#   :verify COMMAND         run once before anything is started; a non-zero
#                           exit status aborts the supervisor.
#   :kill-timeout SECONDS   overrides the --kill-timeout option.
#
# Control fifo ($vardir/sv/.ctrl), newline-terminated commands:
#
#   k NAME                  send TERM to NAME; it is then respawned by the
#                           normal restart logic.
#   d                       shut everything down and exit 0.

use strict;
use warnings;

use Cwd qw/realpath/;
use POSIX qw/:sys_wait_h mkfifo setsid/;
use Fcntl qw/:DEFAULT :flock/;
use Getopt::Long qw/:config require_order gnu_compat/;
use FindBin;
use File::Spec;
use File::Copy;

# Die with a timestamped message. The ($) prototype is kept on purpose: call
# sites throughout the file invoke this without parentheses.
sub logdie($)
{
  my ($msg) = @_;
  chomp $msg;
  die "[" . (scalar localtime()) . "] $msg\n";
}

# Emit a timestamped message on STDERR (which is redirected to STDOUT below).
sub logit($)
{
  my ($msg) = @_;
  chomp $msg;
  warn "[" . (scalar localtime()) . "] $msg\n";
}

# Print the command-line synopsis and exit non-zero.
sub usage
{
  die "usage: $0 -c <conf file> [-d <var dir>] [-t <kill timeout>] [--svlogd <optional conf file>]\n";
}

# Parse the supervise config file.
#
# Returns a hashref with:
#   commands       arrayref of command records (name, command, order and the
#                  runtime bookkeeping fields initialised to 0)
#   verify         arrayref of verification command strings
#   'kill-timeout' integer from a ":kill-timeout" line, or undef if absent
#
# Dies on an unreadable file, a syntax error, or a duplicate command name.
sub read_config_file
{
  my ($config_file) = @_;

  open my $config_fh, "<", $config_file
    or die "open $config_file: $!";

  my @commands;
  my @verify;
  my $kill_timeout;

  while (my $line = <$config_fh>) {
    chomp $line;
    next if $line =~ /^(\s*\#.*|\s*)$/;

    if ($line =~ /^(:verify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:]\S+)\s+(.+)$/) {
      my $name = $1;
      my $order = 50;
      my $command = $2;

      # Optional "!pN " prefix carries the stop order for this command.
      if ($name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
        $order = $1;
        $name = $2;
      }

      if ($name eq ':verify') {
        push @verify, $command;
      } elsif ($name eq ':kill-timeout') {
        $kill_timeout = int($command);
      } else {
        die "Duplicate command: $line\n" if grep { $_->{name} eq $name } @commands;

        push @commands, {
          name       => $name,
          command    => $command,
          order      => $order, # Stop order for this command
          pid        => 0,      # Current pid, or 0 if not running
          down       => 0,      # Time the proc should be down until
          killed     => 0,      # Signal we sent to this process
          restarting => 0,      # True if this command is currently restarting
        };
      }
    } else {
      die "Syntax error: $line\n";
    }
  }

  close $config_fh;

  return { commands => \@commands, verify => \@verify, 'kill-timeout' => $kill_timeout };
}

# Render a wait() status word as "signal = N" or "exited = N", with
# ", dumped core" appended when the core-dump bit is set.
sub stringify_exit_status
{
  my ($status) = @_;

  my $signal = $status & 127;
  my $cored  = $status & 128;
  my $code   = $status >> 8;

  my $string = $signal ? "signal = $signal" : "exited = $code";
  $string .= ", dumped core" if $cored;

  return $string;
}

# (Re)create the control fifo "$svdir/.ctrl" and return a non-blocking handle
# on it. It is opened O_RDWR, so the open does not wait for a writer.
sub open_control_fifo
{
  my ($svdir) = @_;
  my $fifofile = "$svdir/.ctrl";

  if (-e $fifofile) {
    unlink $fifofile or die "Cannot remove fifo: $fifofile\n";
  }

  mkfifo($fifofile, 0700) or die "Cannot create fifo: $fifofile\n";
  sysopen my $fifofh, $fifofile, O_NONBLOCK | O_RDWR
    or die "Cannot open fifo for reading: $fifofile\n";

  return $fifofh;
}

# Wrap $text in ANSI escapes for $color ('bold' or 'red') when STDERR is a
# terminal; otherwise, or for any other color, return $text unchanged.
sub pretty
{
  my ($text, $color) = @_;

  return $text unless -t STDERR;

  if ($color eq 'bold') {
    return "\x1b[1m$text\x1b[0m";
  } elsif ($color eq 'red') {
    return "\x1b[31m\x1b[1m$text\x1b[0m";
  }

  return $text;
}

# Create $dir (and any missing parents) unless it already exists. Returns
# true on success. Uses list-form system() so the path never passes through
# a shell.
sub ensure_dir
{
  my ($dir) = @_;
  return 1 if -e $dir;
  return system('mkdir', '-p', $dir) == 0;
}

my @commands;

# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
my $killed = 0;

# If >0 then kill -9 all procs at this time
my $killkill = 0;

# Current proc order we're stopping. Ignored unless $killed is nonzero
my $stopping = 100;

# We'll do our own reaping
$SIG{CHLD} = sub {};

# Redirect stderr to stdout
open STDERR, ">&STDOUT" or die;

# Parse arguments
my %opt = (
  'chdir' => realpath("$FindBin::Bin/.."),
  'vardir' => realpath("$FindBin::Bin/../var"),
  'kill-timeout' => 360,
);

usage() unless GetOptions(
  \%opt,
  'conf|c=s',
  'vardir|d=s',
  'kill-timeout|t=i',
  'chdir=s',
  'svlogd:s'
);

usage() unless $opt{'conf'} && $opt{'vardir'};

# Read config file
my $config = read_config_file($opt{'conf'});
@commands = @{$config->{commands}};

if (!@commands) {
  die "Nothing to run.\n";
}

# Potentially override --kill-timeout
if (defined $config->{'kill-timeout'}) {
  $opt{'kill-timeout'} = $config->{'kill-timeout'};
}

# Remember where vardir, logdir, svdir are after chdiring
my $vardir = File::Spec->rel2abs($opt{vardir});
my $logdir = File::Spec->rel2abs(realpath($ENV{'DRUID_LOG_DIR'} || "$FindBin::Bin/../log"));
my $svdir = "$vardir/sv";

# chdir to the root of the distribution (or wherever)
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";

# Create vardir with tmp/
ensure_dir("$vardir/tmp") or die "mkdir $vardir/tmp failed: $!\n";

# Create svdir
ensure_dir($svdir) or die "mkdir $svdir failed: $!\n";

# Create logdir, if needed
if (!defined $opt{svlogd}) {
  ensure_dir($logdir) or die "mkdir $logdir failed: $!\n";
}

# Lock svdir and keep it locked until we exit
my $lockfile = "$svdir/.lock";
open my $lockfh, ">", $lockfile
  or die "Cannot write to svdir, please check permissions: $svdir\n";
flock($lockfh, LOCK_EX | LOCK_NB)
  or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";

# Create control fifo in svdir
my $fifofh = open_control_fifo($svdir);

# Run verification commands
for my $verify_cmd (@{$config->{verify}}) {
  system($verify_cmd) == 0 or exit 1;
}

# Catch killy signals and do an orderly shutdown
$SIG{HUP} = sub { if (!$killed) { $killed = 1; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{INT} = sub { if (!$killed) { $killed = 2; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{TERM} = sub { if (!$killed) { $killed = 15; $killkill = time + $opt{'kill-timeout'}; } };

# Build up control fifo command over multiple sysreads, potentially
my $fifobuffer = '';

if (defined $opt{svlogd}) {
  logit "Starting services with log directory [$svdir].";
} else {
  logit "Starting services with log directory [$logdir].";
}

while (1) {
  # Spawn new procs
  if (!$killed) {
    for my $command (grep { !$_->{pid} } @commands) {
      next unless $command->{down} < time;

      my $pid = fork;

      if (!defined $pid) {
        # fork failed: we are still the supervisor. Falling through to the
        # child branch here would exec the service in place of the supervisor.
        # Save $! first; pretty() runs -t, which may overwrite errno.
        my $err = "$!";
        logit "WARN: Could not fork for command[" . pretty($command->{name}, 'bold') . "]: $err";
        $command->{down} = time + 2;
        next;
      }

      if ($pid) {
        # Parent: remember the child and move on.
        $command->{pid} = $pid;
        next;
      }

      # Child from here on; it ends in exec or logdie and never returns to
      # the supervisor loop.
      setsid;

      if (defined $opt{'svlogd'}) {
        # If using svlogd, program output goes into the service directory. We do not use $logdir here.
        my $logfile = "$svdir/$command->{name}";
        logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";

        ensure_dir($logfile) or logdie "mkdir $logfile failed: $!\n";

        if ($opt{'svlogd'}) {
          # --svlogd was given a path: use that file as the svlogd config.
          copy($opt{'svlogd'}, "$logfile/config")
            or logdie "Failed copying $opt{'svlogd'} to $logfile/config: $!";
        } else {
          # --svlogd without a value: write a default svlogd config.
          open my $configfh, ">", "$logfile/config"
            or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
          print {$configfh} "s100000000\nn10\nN5\nt604800";
          close $configfh
            or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
        }

        # List-form pipe open: $logfile is passed as a single argument and
        # is never interpreted by a shell.
        open STDOUT, '|-', 'svlogd', $logfile
          or logdie "pipe to svlogd $logfile failed: $!\n";
      } else {
        # If not using svlogd, program output goes to $logdir. In the default configuration, this will be a small
        # amount of logging from the JVM itself, because all of the Druid and ZooKeeper logs are written into
        # separate files by log4j2.
        logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";
        my $logfile = "$logdir/$command->{name}.stdout.log";
        open STDOUT, ">>", $logfile or logdie "open $logfile failed: $!\n";
      }

      open STDERR, ">&STDOUT" or logdie "redirecting stderr failed: $!\n";
      exec('sh', '-c', "exec $command->{command}")
        or logdie "exec [$command->{command}] failed: $!";
    }
  }

  # Reap dead procs
  my $pid;
  while (($pid = waitpid(-1, WNOHANG)) > 0) {
    my $status = $?;
    my ($command) = (grep { $_->{pid} == $pid } @commands);
    if ($command) {
      $command->{pid} = 0;
      # Keep the command down for a moment before it is respawned.
      $command->{down} = time + 2;

      logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";

      if ($status && !$killed && !$command->{restarting}) {
        # Unexpected exit
        logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see its logfile for more details";
      }

      $command->{restarting} = 0;
    } else {
      logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
    }
  }

  # Kill procs, maybe
  if ($killed) {
    my $should_killkill = time > $killkill;

    # Update stopping position, maybe
    if ($should_killkill) {
      # Timeout expired: every remaining process becomes eligible.
      $stopping = 0;
    } else {
      # Only lower the stopping position once everything at a higher order
      # has exited.
      my $maxorder = 0;
      for my $command (grep { $_->{pid} } @commands) {
        if ($command->{order} > $maxorder) {
          $maxorder = $command->{order};
        }
      }

      if ($maxorder < $stopping) {
        $stopping = $maxorder;
      }
    }

    for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
      my $want_signal;
      if ($command->{killed} == 9 || $should_killkill) {
        $want_signal = 9;
      } else {
        $want_signal = 15;
      }

      # Send each signal to a process only once.
      if ($command->{killed} != $want_signal) {
        if ($want_signal != 9) {
          my $kt = $opt{'kill-timeout'};
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
        } else {
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
        }
        kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
        $command->{killed} = $want_signal;
      }
    }
  }

  # Kill ourselves, maybe
  if ($killed && ! grep { $_->{pid} } @commands) {
    logit "Exiting.";

    $SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
    if ($killed > 0) {
      # Re-raise the signal that asked us to stop, now with default handling.
      kill $killed, $$;
      exit 1;
    } else {
      # Normal exit
      exit 0;
    }
  }

  # Be controlled, maybe
  my $fifostr = "";
  if (sysread $fifofh, $fifostr, 4096) {
    $fifobuffer .= $fifostr;

    # Process each complete line; a trailing partial line stays buffered.
    while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
      my $fifocmd = $1;
      $fifobuffer = $2;

      if ($fifocmd =~ /^k (.+)$/ && !$killed) {
        my $name = $1;
        my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
        if ($command) {
          logit "Restarting command[" . pretty($name, "bold") . "].";
          if (kill TERM => $command->{pid}) {
            $command->{restarting} = 1;
          } else {
            logit "WARN: Could not signal pid: $command->{pid}"
          }
        } else {
          logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
        }
      } elsif ($fifocmd eq 'd') {
        # -1 means exit without signal
        $killed = -1;
        $killkill = time + $opt{'kill-timeout'}
      } else {
        logit "Received unknown control command, ignoring.";
      }
    }
  }

  sleep 1;
}

exit 0;