[Zen2] Fix test failures in diff-based publishing (#35684)
`testIncompatibleDiffResendsFullState` sometimes makes a 2-node cluster and then partitions one of the nodes from the leader, which makes the leader stand down. Then when the partition is removed the cluster re-forms but does so by sending full cluster states, not diffs, causing the test to fail. Additionally `testDiffBasedPublishing` sometimes fails if a publication is delivered out-of-order, wiping out a fresher last-received cluster state with a less-fresh one. This is fixed here by passing the received cluster state to the coordinator before recording it as the last-received one, relying on the coordinator's freshness checks.
This commit is contained in:
parent
c816347253
commit
cfdf666672
|
@ -201,8 +201,10 @@ public class PublicationTransportHandler {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} else if (sendFullVersion || !previousState.nodes().nodeExists(destination)) {
|
} else if (sendFullVersion || !previousState.nodes().nodeExists(destination)) {
|
||||||
|
logger.trace("sending full cluster state version {} to {}", newState.version(), destination);
|
||||||
PublicationTransportHandler.this.sendFullClusterState(newState, serializedStates, destination, responseActionListener);
|
PublicationTransportHandler.this.sendFullClusterState(newState, serializedStates, destination, responseActionListener);
|
||||||
} else {
|
} else {
|
||||||
|
logger.trace("sending cluster state diff for version {} to {}", newState.version(), destination);
|
||||||
PublicationTransportHandler.this.sendClusterStateDiff(newState, serializedDiffs, serializedStates, destination,
|
PublicationTransportHandler.this.sendClusterStateDiff(newState, serializedDiffs, serializedStates, destination,
|
||||||
responseActionListener);
|
responseActionListener);
|
||||||
}
|
}
|
||||||
|
@ -381,7 +383,6 @@ public class PublicationTransportHandler {
|
||||||
private PublishWithJoinResponse handleIncomingPublishRequest(BytesTransportRequest request) throws IOException {
|
private PublishWithJoinResponse handleIncomingPublishRequest(BytesTransportRequest request) throws IOException {
|
||||||
final Compressor compressor = CompressorFactory.compressor(request.bytes());
|
final Compressor compressor = CompressorFactory.compressor(request.bytes());
|
||||||
StreamInput in = request.bytes().streamInput();
|
StreamInput in = request.bytes().streamInput();
|
||||||
final ClusterState incomingState;
|
|
||||||
try {
|
try {
|
||||||
if (compressor != null) {
|
if (compressor != null) {
|
||||||
in = compressor.streamInput(in);
|
in = compressor.streamInput(in);
|
||||||
|
@ -390,11 +391,13 @@ public class PublicationTransportHandler {
|
||||||
in.setVersion(request.version());
|
in.setVersion(request.version());
|
||||||
// If true we received full cluster state - otherwise diffs
|
// If true we received full cluster state - otherwise diffs
|
||||||
if (in.readBoolean()) {
|
if (in.readBoolean()) {
|
||||||
incomingState = ClusterState.readFrom(in, transportService.getLocalNode());
|
final ClusterState incomingState = ClusterState.readFrom(in, transportService.getLocalNode());
|
||||||
fullClusterStateReceivedCount.incrementAndGet();
|
fullClusterStateReceivedCount.incrementAndGet();
|
||||||
logger.debug("received full cluster state version [{}] with size [{}]", incomingState.version(),
|
logger.debug("received full cluster state version [{}] with size [{}]", incomingState.version(),
|
||||||
request.bytes().length());
|
request.bytes().length());
|
||||||
|
final PublishWithJoinResponse response = handlePublishRequest.apply(new PublishRequest(incomingState));
|
||||||
lastSeenClusterState.set(incomingState);
|
lastSeenClusterState.set(incomingState);
|
||||||
|
return response;
|
||||||
} else {
|
} else {
|
||||||
final ClusterState lastSeen = lastSeenClusterState.get();
|
final ClusterState lastSeen = lastSeenClusterState.get();
|
||||||
if (lastSeen == null) {
|
if (lastSeen == null) {
|
||||||
|
@ -402,11 +405,13 @@ public class PublicationTransportHandler {
|
||||||
throw new IncompatibleClusterStateVersionException("have no local cluster state");
|
throw new IncompatibleClusterStateVersionException("have no local cluster state");
|
||||||
} else {
|
} else {
|
||||||
Diff<ClusterState> diff = ClusterState.readDiffFrom(in, lastSeen.nodes().getLocalNode());
|
Diff<ClusterState> diff = ClusterState.readDiffFrom(in, lastSeen.nodes().getLocalNode());
|
||||||
incomingState = diff.apply(lastSeen); // might throw IncompatibleClusterStateVersionException
|
final ClusterState incomingState = diff.apply(lastSeen); // might throw IncompatibleClusterStateVersionException
|
||||||
compatibleClusterStateDiffReceivedCount.incrementAndGet();
|
compatibleClusterStateDiffReceivedCount.incrementAndGet();
|
||||||
logger.debug("received diff cluster state version [{}] with uuid [{}], diff size [{}]",
|
logger.debug("received diff cluster state version [{}] with uuid [{}], diff size [{}]",
|
||||||
incomingState.version(), incomingState.stateUUID(), request.bytes().length());
|
incomingState.version(), incomingState.stateUUID(), request.bytes().length());
|
||||||
|
final PublishWithJoinResponse response = handlePublishRequest.apply(new PublishRequest(incomingState));
|
||||||
lastSeenClusterState.compareAndSet(lastSeen, incomingState);
|
lastSeenClusterState.compareAndSet(lastSeen, incomingState);
|
||||||
|
return response;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (IncompatibleClusterStateVersionException e) {
|
} catch (IncompatibleClusterStateVersionException e) {
|
||||||
|
@ -418,7 +423,5 @@ public class PublicationTransportHandler {
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.close(in);
|
IOUtils.close(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
return handlePublishRequest.apply(new PublishRequest(incomingState));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -784,19 +784,19 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
assertThat(value(cn.getLastAppliedClusterState()), is(finalValue));
|
assertThat(value(cn.getLastAppliedClusterState()), is(finalValue));
|
||||||
if (cn == leader) {
|
if (cn == leader) {
|
||||||
// leader does not update publish stats as it's not using the serialized state
|
// leader does not update publish stats as it's not using the serialized state
|
||||||
assertEquals(prePublishStats.get(cn).getFullClusterStateReceivedCount(),
|
assertEquals(cn.toString(), prePublishStats.get(cn).getFullClusterStateReceivedCount(),
|
||||||
postPublishStats.get(cn).getFullClusterStateReceivedCount());
|
postPublishStats.get(cn).getFullClusterStateReceivedCount());
|
||||||
assertEquals(prePublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount(),
|
assertEquals(cn.toString(), prePublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount(),
|
||||||
postPublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount());
|
postPublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount());
|
||||||
assertEquals(prePublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount(),
|
assertEquals(cn.toString(), prePublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount(),
|
||||||
postPublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount());
|
postPublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount());
|
||||||
} else {
|
} else {
|
||||||
// followers receive a diff
|
// followers receive a diff
|
||||||
assertEquals(prePublishStats.get(cn).getFullClusterStateReceivedCount(),
|
assertEquals(cn.toString(), prePublishStats.get(cn).getFullClusterStateReceivedCount(),
|
||||||
postPublishStats.get(cn).getFullClusterStateReceivedCount());
|
postPublishStats.get(cn).getFullClusterStateReceivedCount());
|
||||||
assertEquals(prePublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount() + 1,
|
assertEquals(cn.toString(), prePublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount() + 1,
|
||||||
postPublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount());
|
postPublishStats.get(cn).getCompatibleClusterStateDiffReceivedCount());
|
||||||
assertEquals(prePublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount(),
|
assertEquals(cn.toString(), prePublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount(),
|
||||||
postPublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount());
|
postPublishStats.get(cn).getIncompatibleClusterStateDiffReceivedCount());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -818,17 +818,21 @@ public class CoordinatorTests extends ESTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testIncompatibleDiffResendsFullState() {
|
public void testIncompatibleDiffResendsFullState() {
|
||||||
final Cluster cluster = new Cluster(randomIntBetween(2, 5));
|
final Cluster cluster = new Cluster(randomIntBetween(3, 5));
|
||||||
cluster.runRandomly();
|
cluster.runRandomly();
|
||||||
cluster.stabilise();
|
cluster.stabilise();
|
||||||
|
|
||||||
final ClusterNode leader = cluster.getAnyLeader();
|
final ClusterNode leader = cluster.getAnyLeader();
|
||||||
final ClusterNode follower = cluster.getAnyNodeExcept(leader);
|
final ClusterNode follower = cluster.getAnyNodeExcept(leader);
|
||||||
|
logger.info("--> blackholing {}", follower);
|
||||||
follower.blackhole();
|
follower.blackhole();
|
||||||
final PublishClusterStateStats prePublishStats = follower.coordinator.stats().getPublishStats();
|
final PublishClusterStateStats prePublishStats = follower.coordinator.stats().getPublishStats();
|
||||||
|
logger.info("--> submitting first value to {}", leader);
|
||||||
leader.submitValue(randomLong());
|
leader.submitValue(randomLong());
|
||||||
cluster.runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY + defaultMillis(PUBLISH_TIMEOUT_SETTING), "publish first state");
|
cluster.runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY + defaultMillis(PUBLISH_TIMEOUT_SETTING), "publish first state");
|
||||||
|
logger.info("--> healing {}", follower);
|
||||||
follower.heal();
|
follower.heal();
|
||||||
|
logger.info("--> submitting second value to {}", leader);
|
||||||
leader.submitValue(randomLong());
|
leader.submitValue(randomLong());
|
||||||
cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
|
cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
|
||||||
final PublishClusterStateStats postPublishStats = follower.coordinator.stats().getPublishStats();
|
final PublishClusterStateStats postPublishStats = follower.coordinator.stats().getPublishStats();
|
||||||
|
|
Loading…
Reference in New Issue