Reduce logging noise when stepping down as master before state recovery (#39950)

Reduces the logging noise from the state recovery component when there are duelling elections.

Relates to #32006
This commit is contained in:
Yannick Welsch 2019-03-15 17:14:00 +01:00
parent 4eb3683d65
commit c74111ff8e
1 changed file with 25 additions and 13 deletions

View File

@ -206,11 +206,20 @@ public class GatewayService extends AbstractLifecycleComponent implements Cluste
if (enforceRecoverAfterTime && recoverAfterTime != null) { if (enforceRecoverAfterTime && recoverAfterTime != null) {
if (scheduledRecovery.compareAndSet(false, true)) { if (scheduledRecovery.compareAndSet(false, true)) {
logger.info("delaying initial state recovery for [{}]. {}", recoverAfterTime, reason); logger.info("delaying initial state recovery for [{}]. {}", recoverAfterTime, reason);
threadPool.schedule(() -> { threadPool.schedule(new AbstractRunnable() {
@Override
public void onFailure(Exception e) {
logger.warn("delayed state recovery failed", e);
resetRecoveredFlags();
}
@Override
protected void doRun() {
if (recovered.compareAndSet(false, true)) { if (recovered.compareAndSet(false, true)) {
logger.info("recover_after_time [{}] elapsed. performing state recovery...", recoverAfterTime); logger.info("recover_after_time [{}] elapsed. performing state recovery...", recoverAfterTime);
recoveryRunnable.run(); recoveryRunnable.run();
} }
}
}, recoverAfterTime, ThreadPool.Names.GENERIC); }, recoverAfterTime, ThreadPool.Names.GENERIC);
} }
} else { } else {
@ -218,10 +227,8 @@ public class GatewayService extends AbstractLifecycleComponent implements Cluste
threadPool.generic().execute(new AbstractRunnable() { threadPool.generic().execute(new AbstractRunnable() {
@Override @Override
public void onFailure(final Exception e) { public void onFailure(final Exception e) {
logger.warn("Recovery failed", e); logger.warn("state recovery failed", e);
// we reset `recovered` in the listener don't reset it here otherwise there might be a race resetRecoveredFlags();
// that resets it to false while a new recover is already running?
GatewayService.this.onFailure("state recovery failed: " + e.getMessage());
} }
@Override @Override
@ -233,11 +240,9 @@ public class GatewayService extends AbstractLifecycleComponent implements Cluste
} }
} }
private void onFailure(final String message) { private void resetRecoveredFlags() {
recovered.set(false); recovered.set(false);
scheduledRecovery.set(false); scheduledRecovery.set(false);
// don't remove the block here, we don't want to allow anything in such a case
logger.info("metadata state not restored, reason: {}", message);
} }
class RecoverStateUpdateTask extends ClusterStateUpdateTask { class RecoverStateUpdateTask extends ClusterStateUpdateTask {
@ -257,10 +262,16 @@ public class GatewayService extends AbstractLifecycleComponent implements Cluste
logger.info("recovered [{}] indices into cluster_state", newState.metaData().indices().size()); logger.info("recovered [{}] indices into cluster_state", newState.metaData().indices().size());
} }
@Override
public void onNoLongerMaster(String source) {
logger.debug("stepped down as master before recovering state [{}]", source);
resetRecoveredFlags();
}
@Override @Override
public void onFailure(final String source, final Exception e) { public void onFailure(final String source, final Exception e) {
logger.info(() -> new ParameterizedMessage("unexpected failure during [{}]", source), e); logger.info(() -> new ParameterizedMessage("unexpected failure during [{}]", source), e);
GatewayService.this.onFailure("failed to update cluster state"); resetRecoveredFlags();
} }
} }
@ -280,7 +291,8 @@ public class GatewayService extends AbstractLifecycleComponent implements Cluste
@Override @Override
public void onFailure(final String msg) { public void onFailure(final String msg) {
GatewayService.this.onFailure(msg); logger.info("state recovery failed: {}", msg);
resetRecoveredFlags();
} }
} }