From fc438e5a722fc172dffb67fc429f48e97278ac64 Mon Sep 17 00:00:00 2001 From: samjhecht Date: Wed, 8 Jul 2015 14:14:20 -0700 Subject: [PATCH] add alert on errors polling for rules reset retrystarttime after alerting --- docs/content/configuration/coordinator.md | 1 + .../metadata/MetadataRuleManagerConfig.java | 8 ++++++++ .../metadata/SQLMetadataRuleManager.java | 20 +++++++++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/docs/content/configuration/coordinator.md b/docs/content/configuration/coordinator.md index 19fa8103893..ad1e0d79d7f 100644 --- a/docs/content/configuration/coordinator.md +++ b/docs/content/configuration/coordinator.md @@ -37,6 +37,7 @@ The coordinator node uses several of the global configs in [Configuration](../co |`druid.manager.segment.pollDuration`|The duration between polls the Coordinator does for updates to the set of active segments. Generally defines the amount of lag time it can take for the coordinator to notice new segments.|PT1M| |`druid.manager.rules.pollDuration`|The duration between polls the Coordinator does for updates to the set of active rules. 
Generally defines the amount of lag time it can take for the coordinator to notice rules.|PT1M| |`druid.manager.rules.defaultTier`|The default tier from which default rules will be loaded from.|_default| +|`druid.manager.rules.alertThreshold`|How long polling for rules must have been failing continuously before an alert is emitted; failures before that are only logged as errors.|PT10M| Dynamic Configuration --------------------- diff --git a/server/src/main/java/io/druid/metadata/MetadataRuleManagerConfig.java b/server/src/main/java/io/druid/metadata/MetadataRuleManagerConfig.java index 86cce3f6ab5..b920a8946ba 100644 --- a/server/src/main/java/io/druid/metadata/MetadataRuleManagerConfig.java +++ b/server/src/main/java/io/druid/metadata/MetadataRuleManagerConfig.java @@ -30,6 +30,9 @@ public class MetadataRuleManagerConfig @JsonProperty private Period pollDuration = new Period("PT1M"); + @JsonProperty + private Period alertThreshold = new Period("PT10M"); + public String getDefaultRule() { return defaultRule; @@ -39,4 +42,9 @@ public class MetadataRuleManagerConfig { return pollDuration; } + + public Period getAlertThreshold() + { + return alertThreshold; + } } diff --git a/server/src/main/java/io/druid/metadata/SQLMetadataRuleManager.java b/server/src/main/java/io/druid/metadata/SQLMetadataRuleManager.java index 9a4f82b9e3a..d4d9e4779d6 100644 --- a/server/src/main/java/io/druid/metadata/SQLMetadataRuleManager.java +++ b/server/src/main/java/io/druid/metadata/SQLMetadataRuleManager.java @@ -33,6 +33,7 @@ import com.metamx.common.Pair; import com.metamx.common.lifecycle.LifecycleStart; import com.metamx.common.lifecycle.LifecycleStop; import com.metamx.common.logger.Logger; +import com.metamx.emitter.EmittingLogger; import io.druid.audit.AuditEntry; import io.druid.audit.AuditInfo; import io.druid.audit.AuditManager; @@ -67,6 +68,8 @@ import java.util.concurrent.atomic.AtomicReference; @ManageLifecycle public class SQLMetadataRuleManager implements MetadataRuleManager { + + public static void createDefaultRule( final IDBI dbi, 
final String ruleTable, @@ -126,7 +129,7 @@ public class SQLMetadataRuleManager implements MetadataRuleManager } } - private static final Logger log = new Logger(SQLMetadataRuleManager.class); + private static final EmittingLogger log = new EmittingLogger(SQLMetadataRuleManager.class); private final ObjectMapper jsonMapper; private final Supplier config; @@ -142,6 +145,8 @@ public class SQLMetadataRuleManager implements MetadataRuleManager private volatile ListeningScheduledExecutorService exec = null; private volatile ListenableFuture future = null; + private volatile long retryStartTime = 0; + @Inject public SQLMetadataRuleManager( @Json ObjectMapper jsonMapper, @@ -287,9 +292,20 @@ public class SQLMetadataRuleManager implements MetadataRuleManager log.info("Polled and found rules for %,d datasource(s)", newRules.size()); rules.set(newRules); + retryStartTime = 0; } catch (Exception e) { - log.error(e, "Exception while polling for rules"); + if (retryStartTime == 0) { + retryStartTime = System.currentTimeMillis(); + } + + /* Period#getMillis() returns only the millis field (0 for PT10M); compare against the period's total length instead. */ if (System.currentTimeMillis() - retryStartTime > config.get().getAlertThreshold().toStandardDuration().getMillis()) { + log.makeAlert(e, "Exception while polling for rules") + .emit(); + retryStartTime = 0; + } else { + log.error(e, "Exception while polling for rules"); + } } }