#!/usr/bin/env perl

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

use strict;
use warnings;
use Cwd qw/realpath/;
use POSIX qw/:sys_wait_h mkfifo setsid/;
use Fcntl qw/:DEFAULT :flock/;
use Getopt::Long qw/:config require_order gnu_compat/;
use FindBin;
use File::Spec;
use File::Copy;

# Abort with a timestamped message.
#
# The message is prefixed with "[<local time>] " and always ends in exactly
# one appended newline (a single trailing newline on the input is stripped
# first), so die() does not add its "at FILE line N" suffix.
sub logdie($)
{
  my $msg = shift;
  chomp $msg;
  die sprintf("[%s] %s\n", scalar(localtime()), $msg);
}

# Emit a timestamped message via warn().
#
# The message is prefixed with "[<local time>] "; one trailing newline on the
# input is stripped and a single newline is appended, which keeps warn() from
# adding its "at FILE line N" suffix.
sub logit($)
{
  my $msg = shift;
  chomp $msg;
  warn sprintf("[%s] %s\n", scalar(localtime()), $msg);
}

# Print the command-line synopsis and terminate the program via die().
sub usage
{
  my $synopsis = "usage: $0 -c <conf file> [-d <var dir>] [-t <kill timeout>] [--svlogd <optional conf file>]\n";
  die $synopsis;
}

# Turn a list of configuration lines into a structured configuration.
#
# Each line is either a directive (":verify <cmd>", ":notify <cmd>",
# ":kill-timeout <n>") or a command definition "[!p<order> ]<name> <cmd>".
#
# Returns a hashref with keys:
#   commands      - arrayref of command records (see below), in input order
#   verify        - arrayref of verification command strings
#   notify        - arrayref of notification command strings
#   kill-timeout  - int() of the last :kill-timeout value, or undef if absent
#
# Dies with "Syntax error: <line>" for a line that does not match the
# grammar and "Duplicate command: <line>" when a command name repeats.
sub process_config
{
    my @lines = @_;

    my (@commands, @verify, @notify);
    my $kill_timeout;
    my %seen_name;    # command names encountered so far

    for my $line (@lines) {
        $line =~ /^(:verify|:notify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:]\S+)\s+(.+)$/
          or die "Syntax error: $line\n";
        my ($name, $command) = ($1, $2);

        # An optional "!p<N> " prefix on the name overrides the default order.
        my $order = 50;
        if ($name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
            ($order, $name) = ($1, $2);
        }

        if ($name eq ':verify') {
            push @verify, $command;
            next;
        }

        if ($name eq ':notify') {
            push @notify, $command;
            next;
        }

        if ($name eq ':kill-timeout') {
            $kill_timeout = int($command);
            next;
        }

        die "Duplicate command: $line\n" if $seen_name{$name}++;

        my %record = (
            name       => $name,
            command    => $command,
            order      => $order,    # Stop order for this command
            pid        => 0,         # Current pid, or 0 if not running
            down       => 0,         # Time the proc should be down until
            killed     => 0,         # Signal we sent to this process
            restarting => 0,         # True if this command is currently restarting
        );
        push @commands, \%record;
    }

    return {
        commands       => \@commands,
        verify         => \@verify,
        notify         => \@notify,
        'kill-timeout' => $kill_timeout,
    };
}

# Read a configuration file and return its meaningful lines.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# Every remaining line must look like a directive or a command definition;
# otherwise the sub dies with "Syntax error: <line>". Returned lines have
# their trailing newline removed. Dies if the file cannot be opened.
sub read_config_file
{
  my ($config_file) = @_;

  open(my $config_fh, "<", $config_file)
    or die "open $config_file: $!";

  my @lines;
  LINE: while (defined(my $line = <$config_fh>)) {
    chomp $line;

    # Comments and whitespace-only lines carry no configuration.
    next LINE if $line =~ /^(\s*\#.*|\s*)$/;

    die "Syntax error: $line\n"
      unless $line =~ /^(:verify|:notify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:]\S+)\s+(.+)$/;

    push @lines, $line;
  }

  close $config_fh;
  return @lines;
}

# Render a wait status (as found in $?) as human-readable text.
#
# The low 7 bits are reported as "signal = N" when nonzero; otherwise the
# value shifted right by 8 is reported as "exited = N". If bit 128 is set,
# ", dumped core" is appended.
sub stringify_exit_status
{
  my ($status) = @_;

  my $signal = $status & 127;
  my @parts = $signal
    ? "signal = $signal"
    : "exited = " . ($status >> 8);

  push @parts, "dumped core" if $status & 128;

  return join(", ", @parts);
}

# (Re)create the control fifo "<svdir>/.ctrl" and return a handle on it.
#
# Any existing file at that path is removed first. The fifo is created with
# mode 0700 and opened read-write and non-blocking. Dies if the old file
# cannot be removed or the fifo cannot be created or opened.
sub open_control_fifo
{
  my ($svdir) = @_;
  my $fifofile = "$svdir/.ctrl";

  # Start from a clean slate: drop whatever a previous run left behind.
  if (-e $fifofile && !unlink($fifofile)) {
    die "Cannot remove fifo: $fifofile\n";
  }

  mkfifo($fifofile, 0700)
    or die "Cannot create fifo: $fifofile\n";

  sysopen(my $fifofh, $fifofile, O_NONBLOCK | O_RDWR)
    or die "Cannot open fifo for reading: $fifofile\n";

  return $fifofh;
}

# Decorate text with ANSI escape sequences when STDERR is a terminal.
#
# $color may be 'bold', 'red' or 'green'; the text is wrapped in the matching
# escape sequence followed by a reset. Any other color, or a STDERR that is
# not a terminal, returns the text unchanged.
sub pretty
{
  my ($text, $color) = @_;

  return $text unless -t STDERR;

  my %start_sequence = (
    bold  => "\x1b[1m",
    red   => "\x1b[31m\x1b[1m",
    green => "\x1b[32m\x1b[1m",
  );

  return exists $start_sequence{$color}
    ? $start_sequence{$color} . $text . "\x1b[0m"
    : $text;
}

# Startup sequence: parse options and configuration, prepare the working
# directories, take the supervisor lock, create the control fifo, run the
# verify/notify hooks and install the shutdown signal handlers. The state
# declared here is consumed by the supervision loop that follows.

# Command records produced by process_config (one hashref per command with
# name, command, order, pid, down, killed and restarting fields).
my @commands;

# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
my $killed = 0;

# If >0 then kill -9 all procs at this time
my $killkill = 0;

# Current proc order we're stopping. Ignored unless $killed is nonzero
my $stopping = 100;

# We'll do our own reaping
$SIG{CHLD} = sub {};

# Redirect stderr to stdout
open STDERR, ">&STDOUT" or die;

# Parse arguments
# Defaults are resolved relative to the directory containing this script.
my %opt = (
  'chdir' => realpath("$FindBin::Bin/.."),
  'vardir' => realpath("$FindBin::Bin/../var"),
  'kill-timeout' => 360,
);

# 'svlogd' takes an optional value (":s") and 'command' may be repeated ("@").
usage() unless GetOptions(
  \%opt,
  'conf|c=s',
  'vardir|d=s',
  'kill-timeout|t=i',
  'chdir=s',
  'svlogd:s',
  'command=s@'
);

# Need something to run (--command or a conf file) and a var directory.
usage() unless (($opt{'command'} && @{$opt{'command'}}) || $opt{'conf'}) && $opt{'vardir'};

my @config_lines;

# get commands to execute either from reading the config file or command line
# (a conf file, when given, takes precedence over --command values)
if (not defined $opt{'conf'}) {
    @config_lines = @{$opt{'command'}}
} else {
    @config_lines = read_config_file($opt{'conf'});
}

my $config = process_config(@config_lines);

@commands = @{$config->{commands}};

if (!@commands) {
  die "Nothing to run.\n";
}

# Potentially override --kill-timeout
# (a :kill-timeout directive in the configuration wins over the option)
if (defined $config->{'kill-timeout'}) {
  $opt{'kill-timeout'} = $config->{'kill-timeout'};
}

# Remember where vardir, logdir, svdir are after chdiring
# The log directory comes from DRUID_LOG_DIR when set, else ../log next to this script.
my $vardir = File::Spec->rel2abs($opt{vardir});
my $logdir = File::Spec->rel2abs(realpath($ENV{'DRUID_LOG_DIR'} || "$FindBin::Bin/../log"));
my $svdir = "$vardir/sv";

# chdir to the root of the distribution (or wherever)
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";

# Create vardir with tmp/
# NOTE(review): after a failed system() the useful status is in $?; $! may not
# describe why mkdir failed -- confirm before relying on these messages.
if (! -e "$vardir/tmp") {
  system("mkdir -p \Q$vardir\E/tmp") == 0 or die "mkdir $vardir/tmp failed: $!\n";
}

# Create svdir
if (! -e $svdir) {
  system("mkdir -p \Q$svdir\E") == 0 or die "mkdir $svdir failed: $!\n";
}

# Create logdir, if needed
# (not needed with --svlogd: command output then goes under svdir instead)
if (!defined $opt{svlogd} && ! -e "$logdir") {
  system("mkdir -p \Q$logdir\E") == 0 or die "mkdir $logdir failed: $!\n";
}

# Lock svdir and keep it locked until we exit
# The lock is non-blocking, so a second supervisor on the same svdir fails fast.
my $lockfile = "$svdir/.lock";
open my $lockfh, ">", $lockfile or die "Cannot write to svdir, please check permissions: $svdir\n";
flock($lockfh, LOCK_EX | LOCK_NB) or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";

# Create control fifo in svdir
my $fifofh = open_control_fifo($svdir);

# Run verification commands
# Any verification command that does not exit 0 aborts startup with exit status 1.
for my $verify_cmd (@{$config->{verify}}) {
  system($verify_cmd) == 0 or exit 1;
}

# Catch killy signals and do an orderly shutdown
# Only the first signal counts: it records which signal to exit with and sets
# the deadline after which remaining processes get kill -9.
$SIG{HUP} = sub { if (!$killed) { $killed = 1; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{INT} = sub { if (!$killed) { $killed = 2; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{TERM} = sub { if (!$killed) { $killed = 15; $killkill = time + $opt{'kill-timeout'}; } };

# Build up control fifo command over multiple sysreads, potentially
my $fifobuffer = '';

# Run notification commands and print their output
# Output (stdout and stderr) is logged only when the command exits 0; output
# of a failing notification command is dropped.
for my $notify_cmd (@{$config->{notify}}) {
  my $notify_output = qx[$notify_cmd 2>&1];
  if (!$?) {
    for my $notify_line (split /\n/, $notify_output) {
      logit pretty($notify_line, 'green');
    }
  }
}

# Announce where command output will be written.
if (defined $opt{svlogd}) {
  logit "Starting services with log directory [$svdir].";
} else {
  logit "Starting services with log directory [$logdir].";
}

# Main supervision loop. Each pass (about once per second) does, in order:
#   1. spawn every command that is not running and whose back-off has expired
#      (skipped once a shutdown has been requested);
#   2. reap exited children and schedule their restart;
#   3. if shutting down, signal running commands in descending stop order,
#      escalating to SIGKILL once the kill timeout has passed;
#   4. exit once shutting down and nothing is left running;
#   5. process newline-terminated commands from the control fifo:
#      "k <name>" restarts a command, "d" requests a shutdown.
while (1) {
  # Spawn new procs
  if (!$killed) {
    for my $command (grep { !$_->{pid} } @commands) {
      if ($command->{down} < time) {
        my $child_pid = fork;

        if (!defined $child_pid) {
          # fork failed, so we are still the supervisor. Treating this like
          # the child branch would make the supervisor exec the command
          # itself. Capture $! before calling anything that may reset it,
          # then back off and retry on a later pass.
          my $fork_error = "$!";
          logit "WARN: Could not fork command[" . pretty($command->{name}, 'bold') . "]: $fork_error";
          $command->{down} = time + 2;
        } elsif ($child_pid) {
          # Parent: remember the child so it can be reaped and signalled.
          $command->{pid} = $child_pid;
        } else {
          # Child: detach into its own session, wire up logging, then exec.
          setsid;

          if (defined $opt{'svlogd'}) {
            # If using svlogd, program output goes into the service directory. We do not use $logdir here.
            my $logfile = "$svdir/$command->{name}";
            logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";

            if (! -e $logfile) {
              system("mkdir -p \Q$logfile\E") == 0 or logdie "mkdir $logfile failed: $!\n";
            }

            if ($opt{'svlogd'}) {
              # --svlogd was given a file: use it as the svlogd config.
              copy($opt{'svlogd'}, "$logfile/config") or logdie "Failed copying $opt{'svlogd'} to $logfile/config: $!";
            } else {
              # --svlogd without a value: write a default svlogd config.
              open my $configfh, ">", "$logfile/config" or logdie "Cannot write svlogd config, please check permissions: $logfile/config\n";
              print $configfh "s100000000\nn10\nN5\nt604800";
              close $configfh;
            }

            # List-form pipe open bypasses the shell, so a log directory
            # containing spaces or shell metacharacters is passed intact.
            open STDOUT, '|-', 'svlogd', $logfile or logdie "pipe to svlogd $logfile failed: $!\n";
          } else {
            # If not using svlogd, program output goes to $logdir. In the default configuration, this will be a small
            # amount of logging from the JVM itself, because all of the Druid and ZooKeeper logs are written into
            # separate files by log4j2.
            logit "Running command[" . pretty($command->{name}, 'bold') . "]: $command->{command}";
            my $logfile = "$logdir/$command->{name}.stdout.log";
            open STDOUT, ">>", $logfile or logdie "open $logfile failed: $!\n";
          }

          open STDERR, ">&STDOUT" or logdie "redirecting stderr failed: $!\n";
          # "exec" inside the shell replaces it, so the recorded pid is the command itself.
          exec('sh', '-c', "exec $command->{command}") or logdie "exec [$command->{command}] failed: $!";
        }
      }
    }
  }

  # Reap dead procs
  my $pid;
  while (($pid = waitpid(-1, WNOHANG)) > 0) {
    my $status = $?;
    my ($command) = (grep { $_->{pid} == $pid } @commands);
    if ($command) {
      $command->{pid} = 0;
      # Keep the command down briefly so a crashing command is not respawned in a tight loop.
      $command->{down} = time + 2;
      logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";
      if ($status && !$killed && !$command->{restarting}) {
        # Unexpected exit
        logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see its logfile for more details";
      }
      $command->{restarting} = 0;
    } else {
      logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
    }
  }

  # Kill procs, maybe
  if ($killed) {
    my $should_killkill = time > $killkill;

    # Update stopping position, maybe
    # $stopping only ever moves down: to the highest order still running, or
    # straight to 0 once the kill timeout has expired.
    if ($should_killkill) {
      $stopping = 0;
    } else {
      my $maxorder = 0;
      for my $command (grep { $_->{pid} } @commands) {
        if ($command->{order} > $maxorder) {
          $maxorder = $command->{order};
        }
      }

      if ($maxorder < $stopping) {
        $stopping = $maxorder;
      }
    }

    # Signal every running command at or above the current stopping position.
    for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
      my $want_signal;
      if ($command->{killed} == 9 || $should_killkill) {
        $want_signal = 9;
      } else {
        $want_signal = 15;
      }

      # Send each signal at most once per command.
      if ($command->{killed} != $want_signal) {
        if ($want_signal != 9) {
          my $kt = $opt{'kill-timeout'};
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
        } else {
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
        }
        kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
        $command->{killed} = $want_signal;
      }
    }
  }

  # Kill ourselves, maybe
  if ($killed && ! grep { $_->{pid} } @commands) {
    logit "Exiting.";
    $SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
    if ($killed > 0) {
      # Re-send the original signal to ourselves with default handlers
      # restored; exit 1 is the fallback if that does not end the process.
      kill $killed, $$;
      exit 1;
    } else {
      # Normal exit
      exit 0;
    }
  }

  # Be controlled, maybe
  my $fifostr = "";
  if (sysread $fifofh, $fifostr, 4096) {
    $fifobuffer .= $fifostr;

    # Consume complete lines; a trailing partial line stays buffered.
    while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
      my $fifocmd = $1;
      $fifobuffer = $2;
      # Restart requests are only honored while not shutting down; during a
      # shutdown they fall through to the "unknown control command" branch.
      if ($fifocmd =~ /^k (.+)$/ && !$killed) {
        my $name = $1;
        my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
        if ($command) {
          logit "Restarting command[" . pretty($name, "bold") . "].";
          if (kill TERM => $command->{pid}) {
            $command->{restarting} = 1;
          } else {
            logit "WARN: Could not signal pid: $command->{pid}"
          }
        } else {
          logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
        }
      } elsif ($fifocmd eq 'd') {
        # -1 means exit without signal
        $killed = -1;
        $killkill = time + $opt{'kill-timeout'}
      } else {
        logit "Received unknown control command, ignoring.";
      }
    }
  }

  sleep 1;
}

# Not reached: the loop above only ends via exit.
exit 0;