2018-11-22 02:46:39 -05:00
|
|
|
#!/usr/bin/env ruby
# frozen_string_literal: true

# critical_dns: resolves the critical Discourse service hosts (database and
# redis) from the environment, health-checks the resolved addresses, and
# rewrites the hosts file so that a healthy address is always pinned.

# Specifying this env var ensures ruby can load gems installed via the Discourse
# project Gemfile (e.g. pg, redis).
ENV['BUNDLE_GEMFILE'] ||= '/var/www/discourse/Gemfile'
require 'bundler/setup'

require 'ipaddr'
require 'pg'
require 'redis'
require 'resolv'
require 'socket'
require 'time'
|
2018-11-22 02:46:39 -05:00
|
|
|
|
|
|
|
# Environment variables naming the hosts this script keeps resolved and
# health-checked.
CRITICAL_HOST_ENV_VARS = %w{
  DISCOURSE_DB_HOST
  DISCOURSE_DB_REPLICA_HOST
  DISCOURSE_REDIS_HOST
  DISCOURSE_REDIS_SLAVE_HOST
  DISCOURSE_REDIS_REPLICA_HOST
}

# env var name => ResolverCache; reused across runs so previously resolved
# addresses survive transient DNS failures.
HOST_RESOLVER_CACHE = {}

# env var name => HealthyCache; remembers the last known-healthy address.
HOST_HEALTHY_CACHE = {}

# Hosts file rewritten by run(); overridable for testing.
HOSTS_PATH = ENV['DISCOURSE_DNS_CACHE_HOSTS_FILE'] || "/etc/hosts"
|
2022-05-08 21:34:04 -04:00
|
|
|
|
|
|
|
# min and max must be integers and relate to the minimum or maximum accepted
# priority of an SRV RR target.
PrioFilter = Struct.new(:min, :max) do
  # True when +priority+ falls inside the closed interval [min, max],
  # i.e. the SRV target is eligible for resolution.
  def within_threshold?(priority)
    (min..max).cover?(priority)
  end
end
|
|
|
|
# Full valid range of an SRV record priority (16-bit unsigned).
SRV_PRIORITY_THRESHOLD_MIN = 0
SRV_PRIORITY_THRESHOLD_MAX = 65535

# SRV env var name => PrioFilter. The default filter accepts the entire
# priority range, i.e. performs no filtering.
SRV_PRIORITY_FILTERS = Hash.new(
  PrioFilter.new(SRV_PRIORITY_THRESHOLD_MIN, SRV_PRIORITY_THRESHOLD_MAX))

# Seconds to sleep between refresh cycles in the main loop.
REFRESH_SECONDS = 30
|
2022-04-05 19:44:34 -04:00
|
|
|
|
|
|
|
# Mixin providing a DNS client with a bounded query timeout.
module DNSClient
  # Yields a Resolv::DNS resolver whose per-query timeout is capped at 2
  # seconds and returns the block's result. The resolver is always closed.
  def dns_client_with_timeout
    resolver = Resolv::DNS.new
    begin
      resolver.timeouts = 2
      yield resolver
    ensure
      resolver.close
    end
  end
end
|
|
|
|
|
|
|
|
# Resolves a plain hostname to its IPv4 and IPv6 addresses.
class Name
  include DNSClient

  def initialize(hostname)
    @name = hostname
  end

  # Returns every A record address followed by every AAAA record address for
  # the hostname, as strings.
  def resolve
    dns_client_with_timeout do |resolver|
      record_types = [Resolv::DNS::Resource::IN::A, Resolv::DNS::Resource::IN::AAAA]
      record_types
        .flat_map { |type| resolver.getresources(@name, type) }
        .map { |record| record.address.to_s }
    end
  end
end
|
|
|
|
|
|
|
|
# Resolves an SRV record name to the addresses of its eligible targets.
class SRVName
  include DNSClient

  # prio_filter is a PrioFilter restricting which SRV targets are eligible.
  def initialize(srv_hostname, prio_filter)
    @name = srv_hostname
    @prio_filter = prio_filter
  end

  # Looks up the SRV RRs for the name, keeps targets whose priority passes
  # the filter, and resolves each remaining target via Name#resolve.
  def resolve
    dns_client_with_timeout do |resolver|
      records = resolver.getresources(@name, Resolv::DNS::Resource::IN::SRV)
      eligible = records.select { |rr| @prio_filter.within_threshold?(rr.priority) }
      eligible.flat_map { |rr| Name.new(rr.target.to_s).resolve }
    end
  end
end
|
|
|
|
|
|
|
|
# Per-address bookkeeping: when the address was first and most recently seen.
CacheMeta = Struct.new(:first_seen, :last_seen)

class ResolverCache
  def initialize(name)
    # instance of Name|SRVName
    @name = name

    # {IPv4/IPv6 address: CacheMeta}
    @cached = {}
  end

  # resolve returns a list of resolved addresses ordered by the time first seen,
  # most recently seen at the head of the list.
  # Addresses last seen more than 30 minutes ago are evicted from the cache on
  # a call to resolve().
  # If an exception occurs during DNS resolution we return whatever addresses are
  # cached.
  #
  # BUGFIX: previously the ordered list was computed inside an `ensure`
  # clause, whose value Ruby discards — the method actually returned the raw
  # result of @name.resolve and propagated DNS exceptions, contradicting the
  # contract above.
  def resolve
    begin
      @name.resolve.each do |address|
        now = Time.now.utc
        if @cached[address]
          @cached[address].last_seen = now
        else
          @cached[address] = CacheMeta.new(now, now)
        end
      end
    rescue StandardError
      # DNS failure: fall through and serve whatever is cached. Eviction
      # below still runs so stale entries age out even while DNS is failing.
    end

    @cached = @cached.delete_if { |_, meta| Time.now.utc - 30 * 60 > meta.last_seen }
    @cached.sort_by { |_, meta| meta.first_seen }.reverse.map(&:first)
  end
end
|
|
|
|
|
|
|
|
# Tracks the most recent address that passed its health check.
class HealthyCache
  def initialize(resolver_cache, check)
    @resolver_cache = resolver_cache # instance of ResolverCache
    @check = check # lambda function to perform for health checks
    @cached = nil # a single IP address that was most recently found to be healthy
  end

  # Returns the first resolved address that passes the health check. When no
  # current address is healthy, falls back to the last known-healthy one
  # (or nil if there never was one). Checks lazily, so probing stops at the
  # first healthy address.
  def first_healthy
    healthy = @resolver_cache.resolve.lazy.select { |ip| @check.call(ip) }.first
    @cached = healthy unless nilempty(healthy).nil?
    @cached
  end
end
|
|
|
|
|
|
|
|
# Returns true when the Redis server at +host+ answers PING, false on any
# error or timeout. TLS is used when DISCOURSE_REDIS_USE_SSL is set.
def redis_healthcheck(host:, password:)
  options = {
    host: host,
    password: password,
    timeout: 1,
  }
  unless nilempty(ENV['DISCOURSE_REDIS_USE_SSL']).nil?
    options[:ssl] = true
    # Certificate verification is intentionally disabled for the probe.
    options[:ssl_params] = {
      verify_mode: OpenSSL::SSL::VERIFY_NONE,
    }
  end

  client = Redis.new(**options)
  client.ping == "PONG"
rescue
  false
ensure
  client.close if client
end
|
|
|
|
|
|
|
|
# Returns true when libpq reports the PostgreSQL server at +host+ is up and
# accepting connections, false on any error.
def postgres_healthcheck(host:, user:, password:, dbname:)
  ping_args = {
    host: host,
    user: user,
    password: password,
    dbname: dbname,
    connect_timeout: 2, # 2 is the minimum accepted by libpq
  }
  PG::Connection.ping(**ping_args) == PG::Constants::PQPING_OK
rescue
  false
end
|
|
|
|
|
|
|
|
# Health check lambdas keyed by the env var (as a symbol) whose host they
# validate; looked up via HEALTH_CHECKS[var.to_sym] when building a
# HealthyCache for each critical host.
HEALTH_CHECKS = {
  "DISCOURSE_DB_HOST": lambda { |addr|
    postgres_healthcheck(
      host: addr,
      user: ENV["DISCOURSE_DB_USER_NAME"],
      password: ENV["DISCOURSE_DB_PASSWORD"],
      dbname: ENV["DISCOURSE_DB_NAME"])},
  "DISCOURSE_DB_REPLICA_HOST": lambda { |addr|
    postgres_healthcheck(
      host: addr,
      user: ENV["DISCOURSE_DB_USER_NAME"],
      password: ENV["DISCOURSE_DB_PASSWORD"],
      dbname: ENV["DISCOURSE_DB_NAME"])},
  "DISCOURSE_REDIS_HOST": lambda { |addr|
    redis_healthcheck(
      host: addr,
      password: ENV["DISCOURSE_REDIS_PASSWORD"])},
  # FIX: legacy alias for DISCOURSE_REDIS_REPLICA_HOST. It is listed in
  # CRITICAL_HOST_ENV_VARS, but had no entry here, so its HealthyCache was
  # built with a nil check and raised NoMethodError on every run.
  "DISCOURSE_REDIS_SLAVE_HOST": lambda { |addr|
    redis_healthcheck(
      host: addr,
      password: ENV["DISCOURSE_REDIS_PASSWORD"])},
  "DISCOURSE_REDIS_REPLICA_HOST": lambda { |addr|
    redis_healthcheck(
      host: addr,
      password: ENV["DISCOURSE_REDIS_PASSWORD"])},
}
|
2018-11-22 02:46:39 -05:00
|
|
|
|
|
|
|
# Writes a UTC-timestamped message to STDERR.
def log(msg)
  timestamp = Time.now.utc.iso8601
  STDERR.puts("#{timestamp}: #{msg}")
end
|
|
|
|
|
|
|
|
# Logs an error message. Currently identical to log(); kept separate so call
# sites read as errors and the error sink can diverge later.
def error(msg)
  log(msg)
end
|
|
|
|
|
|
|
|
# Rewrites hosts-file content: drops every non-comment entry whose first
# hostname equals +name+, then appends one freshly generated, timestamped
# line per address in +ips+. Returns the new file content as a string.
def swap_address(hosts, name, ips)
  kept = []

  hosts.split("\n").each do |raw|
    entry = raw.strip
    unless entry.start_with?('#')
      _address, first_hostname = entry.split(/\s+/)
      # Drop the stale entry for the hostname being replaced.
      next if first_hostname == name
    end
    kept << entry << "\n"
  end

  ips.each do |ip|
    kept << "#{ip} #{name} # AUTO GENERATED: #{Time.now.utc.iso8601}\n"
  end

  kept.join
end
|
|
|
|
|
|
|
|
# Reports a single Counter metric to the local discourse-prometheus
# collector using a hand-rolled HTTP/1.1 POST over a raw TCP socket.
#
# name        - metric name string
# description - human-readable help text
# labels      - Hash of label name => value, or nil for no labels
# value       - amount to add to the counter
#
# Failures are logged via error() and never raised to the caller.
def send_counter(name, description, labels, value)
  host = "localhost"
  port = ENV.fetch("DISCOURSE_PROMETHEUS_COLLECTOR_PORT", 9405).to_i

  # Render the labels hash as JSON object members.
  # NOTE(review): keys/values are interpolated without JSON escaping — fine
  # for the fixed label names used in this script, not for arbitrary input.
  if labels
    labels = labels.map do |k, v|
      "\"#{k}\": \"#{v}\""
    end.join(",")
  else
    labels = ""
  end

  json = <<~JSON
  {
    "_type": "Custom",
    "type": "Counter",
    "name": "#{name}",
    "description": "#{description}",
    "labels": { #{labels} },
    "value": #{value}
  }
  JSON

  # Build the raw HTTP request by hand rather than pulling in net/http.
  payload = +"POST /send-metrics HTTP/1.1\r\n"
  payload << "Host: #{host}\r\n"
  payload << "Connection: Close\r\n"
  payload << "Content-Type: application/json\r\n"
  payload << "Content-Length: #{json.bytesize}\r\n"
  payload << "\r\n"
  payload << json

  socket = TCPSocket.new host, port
  socket.write payload
  socket.flush
  # Connection: Close lets us read the entire response in one call.
  result = socket.read
  first_line = result.split("\n")[0]
  if first_line.strip != "HTTP/1.1 200 OK"
    error("Failed to report metric #{result}")
  end
  socket.close
rescue => e
  error("Failed to send metric to Prometheus #{e}")
end
|
|
|
|
|
|
|
|
# Increments the success counter metric for a completed resolution run.
def report_success
  send_counter('critical_dns_successes_total', 'critical DNS resolution success', nil, 1)
end
|
|
|
|
|
|
|
|
# Emits one failure counter increment per failing host. +errors+ maps a
# hostname (or nil for run-level failures) to a failure count; a nil host is
# reported without labels.
def report_failure(errors)
  errors.each_pair do |host, count|
    labels = host ? { host: host } : nil
    send_counter('critical_dns_failures_total', 'critical DNS resolution failures', labels, count)
  end
end
|
|
|
|
|
2022-04-05 19:44:34 -04:00
|
|
|
# Collapses "empty" values to nil: returns nil when +v+ is nil, or when it
# responds to #empty? and is empty; otherwise returns +v+ unchanged.
def nilempty(v)
  return nil if v.nil?
  return nil if v.respond_to?(:empty?) && v.empty?
  v
end
|
2018-11-22 02:46:39 -05:00
|
|
|
|
2022-05-08 21:34:04 -04:00
|
|
|
# Name of the companion SRV-record env var for +env_name+,
# e.g. "DISCOURSE_DB_HOST" => "DISCOURSE_DB_HOST_SRV".
def env_srv_var(env_name)
  [env_name, 'SRV'].join('_')
end
|
|
|
|
|
2022-04-05 19:44:34 -04:00
|
|
|
# Value of the SRV env var paired with +env_name+, or nil when it is unset
# or empty.
def env_srv_name(env_name)
  raw = ENV[env_srv_var(env_name)]
  nilempty(raw)
end
|
2018-11-22 02:46:39 -05:00
|
|
|
|
2022-04-05 19:44:34 -04:00
|
|
|
# Performs one resolution/health-check cycle over the given env var names and
# rewrites HOSTS_PATH when the healthy address for any host changed.
# Per-host failures are logged and counted; the method itself never raises.
# Success or failure is always reported as a counter metric via the ensure.
def run(hostname_vars)
  # HOSTNAME: [IP_ADDRESS, ...]
  # this will usually be a single address
  resolved = {}
  # hostname (or nil for run-level failures) => failure count
  errors = Hash.new(0)

  hostname_vars.each do |var|
    name = ENV[var]
    # Lazily build (and then reuse) the resolver for this var: an SRVName
    # when a *_SRV env var is set, otherwise a plain Name.
    HOST_RESOLVER_CACHE[var] ||= ResolverCache.new(
      if (srv_name = env_srv_name(var))
        SRVName.new(srv_name, SRV_PRIORITY_FILTERS[env_srv_var(var)])
      else
        Name.new(name)
      end
    )

    HOST_HEALTHY_CACHE[var] ||= HealthyCache.new(HOST_RESOLVER_CACHE[var], HEALTH_CHECKS[var.to_sym])

    begin
      if (address = HOST_HEALTHY_CACHE[var].first_healthy)
        resolved[name] = [address]
      else
        error("#{var}: #{name}: no address")
        errors[name] += 1
      end
    rescue => e
      # Catch per-host so one failing host does not abort the others.
      error("#{var}: #{name}: #{e}")
      errors[name] += 1
    end
  end

  hosts_content = File.read(HOSTS_PATH)
  hosts = Resolv::Hosts.new(HOSTS_PATH)

  # Only rewrite the hosts file when some address actually changed.
  changed = false
  resolved.each do |hostname, ips|
    if hosts.getaddresses(hostname).map(&:to_s).sort != ips.sort
      log("IP addresses for #{hostname} changed to #{ips}")
      hosts_content = swap_address(hosts_content, hostname, ips)
      changed = true
    end
  end

  if changed
    File.write(HOSTS_PATH, hosts_content)
  end

rescue => e
  error("DNS lookup failed: #{e}")
  errors[nil] = 1
ensure
  # Hash equality ignores the default proc, so Hash.new(0) == {} when empty.
  if errors == {}
    report_success
  else
    report_failure(errors)
  end
end
|
|
|
|
|
2022-04-05 19:44:34 -04:00
|
|
|
# If any of the host variables are an explicit IP we will not attempt to cache
# them.
all_hostname_vars = CRITICAL_HOST_ENV_VARS.select do |name|
  begin
    host = ENV[name]
    # Skip vars that are unset or empty (next returns nil => excluded).
    next if nilempty(host).nil?
    # IPAddr.new succeeding means the value is a literal IP: nothing to
    # resolve, so exclude it.
    IPAddr.new(host)
    false
  rescue IPAddr::InvalidAddressError, IPAddr::AddressFamilyError
    # Not a literal IP; treat it as a hostname to manage.
    true
  end
end
|
|
|
|
|
2022-05-08 21:34:04 -04:00
|
|
|
# Populate the SRV_PRIORITY_FILTERS for any name that has a priority present in
# the environment. If no priority thresholds are found for the name, the default
# is that no filtering based on priority will be performed.
CRITICAL_HOST_ENV_VARS.each do |v|
  if (name = env_srv_name(v))
    # *_SRV_PRIORITY_LE / *_SRV_PRIORITY_GE bound the accepted SRV priorities
    # (inclusive upper / lower threshold respectively).
    max = ENV.fetch("#{env_srv_var(v)}_PRIORITY_LE", SRV_PRIORITY_THRESHOLD_MAX).to_i
    min = ENV.fetch("#{env_srv_var(v)}_PRIORITY_GE", SRV_PRIORITY_THRESHOLD_MIN).to_i
    # Fail fast at startup on thresholds outside the valid SRV priority
    # range or an inverted interval.
    if max > SRV_PRIORITY_THRESHOLD_MAX ||
        min < SRV_PRIORITY_THRESHOLD_MIN ||
        min > max
      raise "invalid priority threshold set for #{v}"
    end

    SRV_PRIORITY_FILTERS[env_srv_var(v)] = PrioFilter.new(min, max)
  end
end
|
|
|
|
|
2018-11-22 02:46:39 -05:00
|
|
|
# Main loop: refresh the critical host entries forever, sleeping between
# runs. `loop do` is the idiomatic Ruby infinite loop (was `while true`);
# run() never raises, so this only exits on a signal.
loop do
  run(all_hostname_vars)
  sleep REFRESH_SECONDS
end
|