Issue 989: handles transient node-not-found in nodeRunning

Sometimes on aws-ec2, the client returns null (i.e. node not found).
This fix will cause us retry with the correct node-id, rather than
"forgetting" the original nodeId and retrying with null each time.
This commit is contained in:
Aled Sage 2012-06-26 10:09:07 +01:00
parent 1c6e2c64d3
commit 65b813b1be
2 changed files with 106 additions and 5 deletions

View File

@ -57,7 +57,7 @@ public class ComputeServiceTimeoutsModule extends AbstractModule {
@Singleton
@Named(TIMEOUT_NODE_RUNNING)
protected Predicate<AtomicReference<NodeMetadata>> nodeRunning(AtomicNodeRunning statusRunning, Timeouts timeouts) {
return timeouts.nodeRunning == 0 ? statusRunning : new RetryablePredicate<AtomicReference<NodeMetadata>>(statusRunning,
return timeouts.nodeRunning == 0 ? statusRunning : new RetryablePredicateGuardingNull<NodeMetadata>(statusRunning,
timeouts.nodeRunning);
}
@ -74,7 +74,7 @@ public class ComputeServiceTimeoutsModule extends AbstractModule {
@Singleton
@Named(TIMEOUT_NODE_SUSPENDED)
protected Predicate<AtomicReference<NodeMetadata>> serverSuspended(AtomicNodeSuspended statusSuspended, Timeouts timeouts) {
return timeouts.nodeSuspended == 0 ? statusSuspended : new RetryablePredicate<AtomicReference<NodeMetadata>>(statusSuspended,
return timeouts.nodeSuspended == 0 ? statusSuspended : new RetryablePredicateGuardingNull<NodeMetadata>(statusSuspended,
timeouts.nodeSuspended);
}
@ -90,7 +90,7 @@ public class ComputeServiceTimeoutsModule extends AbstractModule {
@Singleton
@Named(TIMEOUT_IMAGE_AVAILABLE)
protected Predicate<AtomicReference<Image>> imageAvailable(AtomicImageAvailable statusAvailable, Timeouts timeouts) {
return timeouts.imageAvailable == 0 ? statusAvailable : new RetryablePredicate<AtomicReference<Image>>(statusAvailable,
return timeouts.imageAvailable == 0 ? statusAvailable : new RetryablePredicateGuardingNull<Image>(statusAvailable,
timeouts.imageAvailable);
}
@ -106,4 +106,46 @@ public class ComputeServiceTimeoutsModule extends AbstractModule {
protected void configure() {
}
/**
* Avoids "losing" the ComputeMetadata if client returns null temporarily (issue #989).
* Ensures we always pass a non-null to the wrapped predicate, but will propagate the null to
* the caller qt the end.
*
* @author Aled Sage
*/
private static class RetryablePredicateGuardingNull<T> implements Predicate<AtomicReference<T>> {
private class AtomicRefAndOrig {
private final T orig;
private final AtomicReference<T> ref;
AtomicRefAndOrig(T orig, AtomicReference<T> ref) {
this.orig = orig;
this.ref = ref;
}
}
private final RetryablePredicate<AtomicRefAndOrig> retryablePredicate;
public RetryablePredicateGuardingNull(final Predicate<AtomicReference<T>> predicate, long maxWait) {
Predicate<AtomicRefAndOrig> nonNullThingPredicate = new Predicate<AtomicRefAndOrig>() {
@Override
public boolean apply(AtomicRefAndOrig input) {
AtomicReference<T> ref = (input.ref.get() != null) ? input.ref : new AtomicReference<T>(input.orig);
try {
return predicate.apply(ref);
} finally {
input.ref.set(ref.get());
}
}
};
retryablePredicate = new RetryablePredicate<AtomicRefAndOrig>(nonNullThingPredicate, maxWait);
}
@Override
public boolean apply(AtomicReference<T> input) {
AtomicRefAndOrig refAndOrig = new AtomicRefAndOrig(input.get(), input);
return retryablePredicate.apply(refAndOrig);
}
}
}

View File

@ -31,10 +31,13 @@ import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import org.easymock.EasyMock;
import org.easymock.IAnswer;
import org.jclouds.compute.config.ComputeServiceTimeoutsModule;
import org.jclouds.compute.config.CustomizationResponse;
import org.jclouds.compute.domain.NodeMetadata;
import org.jclouds.compute.domain.NodeMetadataBuilder;
import org.jclouds.compute.domain.NodeMetadata.Status;
import org.jclouds.compute.domain.NodeMetadataBuilder;
import org.jclouds.compute.functions.TemplateOptionsToStatement;
import org.jclouds.compute.options.TemplateOptions;
import org.jclouds.compute.predicates.AtomicNodeRunning;
@ -45,7 +48,9 @@ import org.testng.Assert;
import org.testng.annotations.Test;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
@ -190,4 +195,58 @@ public class CustomizeNodeAndAddToGoodMapOrPutExceptionIntoBadMapTest {
// verify mocks
verify(initScriptRunnerFactory, openSocketFinder);
}
public void testRecoversWhenTemporarilyNodeNotFound() {
String nodeId = "myid";
InitializeRunScriptOnNodeOrPlaceInBadMap.Factory initScriptRunnerFactory = createMock(InitializeRunScriptOnNodeOrPlaceInBadMap.Factory.class);
OpenSocketFinder openSocketFinder = createMock(OpenSocketFinder.class);
Timeouts timeouts = new Timeouts();
Function<TemplateOptions, Statement> templateOptionsToStatement = new TemplateOptionsToStatement();
Set<NodeMetadata> goodNodes = Sets.newLinkedHashSet();
Map<NodeMetadata, Exception> badNodes = Maps.newLinkedHashMap();
Multimap<NodeMetadata, CustomizationResponse> customizationResponses = LinkedHashMultimap.create();
TemplateOptions options = new TemplateOptions();
final NodeMetadata pendingNode = new NodeMetadataBuilder().ids(nodeId).status(Status.PENDING).build();
final NodeMetadata runningNode = new NodeMetadataBuilder().ids(nodeId).status(Status.RUNNING).build();
GetNodeMetadataStrategy nodeClient = createMock(GetNodeMetadataStrategy.class);
AtomicNodeRunning nodeRunning = new AtomicNodeRunning(nodeClient);
Predicate<AtomicReference<NodeMetadata>> retryableNodeRunning = new ComputeServiceTimeoutsModule() {
public Predicate<AtomicReference<NodeMetadata>> nodeRunning(AtomicNodeRunning statusRunning, Timeouts timeouts) {
return super.nodeRunning(statusRunning, timeouts);
}
}.nodeRunning(nodeRunning, timeouts);
AtomicReference<NodeMetadata> atomicNode = new AtomicReference<NodeMetadata>(pendingNode);
// Simulate transient error: first call returns null; subsequent calls return the running node
EasyMock.expect(nodeClient.getNode(nodeId)).andAnswer(new IAnswer<NodeMetadata>() {
private int count = 0;
@Override
public NodeMetadata answer() throws Throwable {
count++;
if (count <= 1) {
return null;
} else {
return runningNode;
}
}
}).anyTimes();
// replay mocks
replay(initScriptRunnerFactory, openSocketFinder, nodeClient);
// run
new CustomizeNodeAndAddToGoodMapOrPutExceptionIntoBadMap(retryableNodeRunning, openSocketFinder, timeouts,
templateOptionsToStatement, initScriptRunnerFactory, options, atomicNode, goodNodes, badNodes,
customizationResponses).apply(atomicNode);
if (badNodes.size() > 0) Iterables.get(badNodes.values(), 0).printStackTrace();
assertEquals(badNodes.size(), 0);
assertEquals(goodNodes, ImmutableSet.of(runningNode));
assertEquals(customizationResponses.size(), 0);
// verify mocks
verify(initScriptRunnerFactory, openSocketFinder, nodeClient);
}
}