DEV(cache_critical_dns): improve error reporting for failures
There are two failure modes that can be expected - no target SRV DNS RRs found or no healthy service available at target addresses. Prior to this patch, there was no way to differentiate from log messages between the two cases. Introduce an EmptyCache exception which may be raised by either the ResolverCache or HealthyCache. The exception message contains enough information about where the exception occurred to troubleshoot issues. An existing bug was fixed in this commit. Previously if a target address changed during runtime, an old cached (healthy) address would be returned.. The behaviour has been corrected to return the newly cached address.
This commit is contained in:
parent
5ea89d1fcb
commit
f38779adf4
|
@ -177,6 +177,8 @@ end
|
|||
|
||||
CacheMeta = Struct.new(:first_seen, :last_seen)
|
||||
|
||||
class EmptyCacheError < StandardError; end
|
||||
|
||||
class ResolverCache
|
||||
def initialize(name)
|
||||
# instance of Name|SRVName
|
||||
|
@ -186,22 +188,29 @@ class ResolverCache
|
|||
@cached = {}
|
||||
end
|
||||
|
||||
# resolve returns a list of resolved addresses ordered by the time first seen,
|
||||
# most recently seen at the head of the list.
|
||||
# Addresses last seen more than 30 minutes ago are evicted from the cache on
|
||||
# a call to resolve().
|
||||
# If an exception occurs during DNS resolution we return whatever addresses are
|
||||
# cached.
|
||||
# Returns a list of resolved addresses ordered by first seen time. Most
|
||||
# recently seen address is first.
|
||||
# If an exception occurs during DNS resolution we return whatever addresses
|
||||
# are cached.
|
||||
# Addresses last seen more than 30 minutes ago are evicted from the cache.
|
||||
# Raises EmptyCacheError if the cache is empty after DNS resolution and cache
|
||||
# eviction is performed.
|
||||
def resolve
|
||||
@name.resolve.each do |address|
|
||||
if @cached[address]
|
||||
@cached[address].last_seen = Time.now.utc
|
||||
else
|
||||
@cached[address] = CacheMeta.new(Time.now.utc, Time.now.utc)
|
||||
begin
|
||||
@name.resolve.each do |address|
|
||||
if @cached[address]
|
||||
@cached[address].last_seen = Time.now.utc
|
||||
else
|
||||
@cached[address] = CacheMeta.new(Time.now.utc, Time.now.utc)
|
||||
end
|
||||
end
|
||||
rescue Resolv::ResolvError, Resolv::ResolvTimeout
|
||||
end
|
||||
ensure
|
||||
|
||||
@cached = @cached.delete_if { |_, meta| Time.now.utc - 30 * 60 > meta.last_seen }
|
||||
if @cached.empty?
|
||||
raise EmptyCacheError, "DNS resolver found no usable addresses"
|
||||
end
|
||||
@cached.sort_by { |_, meta| meta.first_seen }.reverse.map(&:first)
|
||||
end
|
||||
end
|
||||
|
@ -213,11 +222,25 @@ class HealthyCache
|
|||
@cached = nil # a single IP address that was most recently found to be healthy
|
||||
end
|
||||
|
||||
# Returns the first healthy server found in the list of resolved addresses.
|
||||
# Returns the last known healthy server if all servers disappear from the
|
||||
# DNS.
|
||||
# Raises EmptyCacheError if no healthy servers have been cached.
|
||||
def first_healthy
|
||||
address = @resolver_cache.resolve.lazy.select { |addr| @check.call(addr) }.first
|
||||
if !nilempty(address).nil?
|
||||
begin
|
||||
addresses = @resolver_cache.resolve
|
||||
rescue EmptyCacheError
|
||||
return @cached if @cached
|
||||
raise
|
||||
end
|
||||
|
||||
if (address = addresses.lazy.select { |addr| @check.call(addr) }.first)
|
||||
@cached = address
|
||||
end
|
||||
if @cached.nil?
|
||||
raise EmptyCacheError, "no healthy servers found amongst #{addresses}"
|
||||
end
|
||||
|
||||
@cached
|
||||
end
|
||||
end
|
||||
|
@ -413,8 +436,6 @@ def run_and_report(hostname_vars)
|
|||
end
|
||||
|
||||
def run(hostname_vars)
|
||||
# HOSTNAME: [IP_ADDRESS, ...]
|
||||
# this will usually be a single address
|
||||
resolved = {}
|
||||
errors = Hash.new(0)
|
||||
|
||||
|
@ -431,13 +452,9 @@ def run(hostname_vars)
|
|||
HOST_HEALTHY_CACHE[var] ||= HealthyCache.new(HOST_RESOLVER_CACHE[var], HEALTH_CHECKS[var.to_sym])
|
||||
|
||||
begin
|
||||
if (address = HOST_HEALTHY_CACHE[var].first_healthy)
|
||||
resolved[name] = [address]
|
||||
else
|
||||
error("#{var}: #{name}: no address")
|
||||
errors[name] += 1
|
||||
end
|
||||
rescue => e
|
||||
address = HOST_HEALTHY_CACHE[var].first_healthy
|
||||
resolved[name] = [address]
|
||||
rescue EmptyCacheError => e
|
||||
error("#{var}: #{name}: #{e}")
|
||||
errors[name] += 1
|
||||
end
|
||||
|
@ -458,9 +475,8 @@ def run(hostname_vars)
|
|||
if changed
|
||||
File.write(HOSTS_PATH, hosts_content)
|
||||
end
|
||||
|
||||
rescue => e
|
||||
error("DNS lookup failed: #{e}")
|
||||
error("unhandled exception during run: #{e}")
|
||||
errors[nil] = 1
|
||||
ensure
|
||||
return errors
|
||||
|
|
Loading…
Reference in New Issue