ec2 suspend nodes may take up to 2 minutes

This commit is contained in:
Adrian Cole 2010-11-07 22:42:34 +01:00
parent 5f5b01ad35
commit 1347165118
3 changed files with 78 additions and 52 deletions

View File

@ -26,6 +26,7 @@ import static org.jclouds.aws.ec2.reference.EC2Constants.PROPERTY_EC2_CC_AMIs;
import static org.jclouds.aws.ec2.reference.EC2Constants.PROPERTY_ELB_ENDPOINT; import static org.jclouds.aws.ec2.reference.EC2Constants.PROPERTY_ELB_ENDPOINT;
import static org.jclouds.aws.reference.AWSConstants.PROPERTY_AUTH_TAG; import static org.jclouds.aws.reference.AWSConstants.PROPERTY_AUTH_TAG;
import static org.jclouds.aws.reference.AWSConstants.PROPERTY_HEADER_TAG; import static org.jclouds.aws.reference.AWSConstants.PROPERTY_HEADER_TAG;
import static org.jclouds.compute.reference.ComputeServiceConstants.PROPERTY_TIMEOUT_NODE_SUSPENDED;
import java.util.Properties; import java.util.Properties;
@ -49,11 +50,15 @@ public class EC2PropertiesBuilder extends PropertiesBuilder {
properties.setProperty(PROPERTY_EC2_AMI_OWNERS, "137112412989,063491364108,099720109477,411009282317"); properties.setProperty(PROPERTY_EC2_AMI_OWNERS, "137112412989,063491364108,099720109477,411009282317");
// amis that work with the cluster instances // amis that work with the cluster instances
properties.setProperty(PROPERTY_EC2_CC_AMIs, "us-east-1/ami-7ea24a17"); properties.setProperty(PROPERTY_EC2_CC_AMIs, "us-east-1/ami-7ea24a17");
// sometimes, like in ec2, stop takes a very long time, perhaps
// due to volume management. one example spent 2 minutes moving
// from stopping->stopped state on an ec2 micro
properties.setProperty(PROPERTY_TIMEOUT_NODE_SUSPENDED, 120 * 1000 + "");
// auth fail sometimes happens in EC2, as the rc.local script that injects the // auth fail sometimes happens in EC2, as the rc.local script that injects the
// authorized key executes after ssh has started // authorized key executes after ssh has started
properties.setProperty("jclouds.ssh.max_retries", "7"); properties.setProperty("jclouds.ssh.max_retries", "7");
properties.setProperty("jclouds.ssh.retryable_messages", properties.setProperty("jclouds.ssh.retryable_messages",
"Auth fail,invalid data,End of IO Stream Read,Connection reset,socket is not established"); "Auth fail,invalid data,End of IO Stream Read,Connection reset,socket is not established");
return properties; return properties;
} }

View File

@ -137,7 +137,7 @@ public interface ComputeService {
* default, equivalent to {@code templateBuilder().any().options(templateOptions)}. * default, equivalent to {@code templateBuilder().any().options(templateOptions)}.
*/ */
Set<? extends NodeMetadata> runNodesWithTag(String tag, int count, TemplateOptions templateOptions) Set<? extends NodeMetadata> runNodesWithTag(String tag, int count, TemplateOptions templateOptions)
throws RunNodesException; throws RunNodesException;
/** /**
* Like {@link ComputeService#runNodesWithTag(String,int,TemplateOptions)}, except that the * Like {@link ComputeService#runNodesWithTag(String,int,TemplateOptions)}, except that the
@ -148,6 +148,10 @@ public interface ComputeService {
/** /**
* resume the node from {@link org.jclouds.compute.domain.NodeState#SUSPENDED suspended} state, * resume the node from {@link org.jclouds.compute.domain.NodeState#SUSPENDED suspended} state,
* given its id. * given its id.
*
* <h4>note</h4>
*
* affected nodes may not resume with the same IP address(es)
*/ */
void resumeNode(String id); void resumeNode(String id);
@ -155,6 +159,10 @@ public interface ComputeService {
* nodes matching the filter are treated as a logical set. Using the resume command, you can save * nodes matching the filter are treated as a logical set. Using the resume command, you can save
* time by resumeing the nodes in parallel. * time by resumeing the nodes in parallel.
* *
* <h4>note</h4>
*
* affected nodes may not resume with the same IP address(es)
*
* @throws UnsupportedOperationException * @throws UnsupportedOperationException
* if the underlying provider doesn't support suspend/resume * if the underlying provider doesn't support suspend/resume
*/ */
@ -164,6 +172,10 @@ public interface ComputeService {
* suspend the node, given its id. This will result in * suspend the node, given its id. This will result in
* {@link org.jclouds.compute.domain.NodeState#SUSPENDED suspended} state. * {@link org.jclouds.compute.domain.NodeState#SUSPENDED suspended} state.
* *
* <h4>note</h4>
*
* affected nodes may not resume with the same IP address(es)
*
* @throws UnsupportedOperationException * @throws UnsupportedOperationException
* if the underlying provider doesn't support suspend/resume * if the underlying provider doesn't support suspend/resume
*/ */
@ -173,6 +185,9 @@ public interface ComputeService {
* nodes matching the filter are treated as a logical set. Using the suspend command, you can * nodes matching the filter are treated as a logical set. Using the suspend command, you can
* save time by suspending the nodes in parallel. * save time by suspending the nodes in parallel.
* *
* <h4>note</h4>
*
* affected nodes may not resume with the same IP address(es)
*/ */
void suspendNodesMatching(Predicate<NodeMetadata> filter); void suspendNodesMatching(Predicate<NodeMetadata> filter);
@ -224,7 +239,7 @@ public interface ComputeService {
* @see org.jclouds.compute.predicates.NodePredicates#runningWithTag(String) * @see org.jclouds.compute.predicates.NodePredicates#runningWithTag(String)
*/ */
Map<? extends NodeMetadata, ExecResponse> runScriptOnNodesMatching(Predicate<NodeMetadata> filter, Payload runScript) Map<? extends NodeMetadata, ExecResponse> runScriptOnNodesMatching(Predicate<NodeMetadata> filter, Payload runScript)
throws RunScriptOnNodesException; throws RunScriptOnNodesException;
/** /**
* Run the script on all nodes with the specific tag. * Run the script on all nodes with the specific tag.
@ -243,6 +258,6 @@ public interface ComputeService {
* @see org.jclouds.io.Payloads * @see org.jclouds.io.Payloads
*/ */
Map<? extends NodeMetadata, ExecResponse> runScriptOnNodesMatching(Predicate<NodeMetadata> filter, Map<? extends NodeMetadata, ExecResponse> runScriptOnNodesMatching(Predicate<NodeMetadata> filter,
Payload runScript, RunScriptOptions options) throws RunScriptOnNodesException; Payload runScript, RunScriptOptions options) throws RunScriptOnNodesException;
} }

View File

@ -161,8 +161,8 @@ public abstract class BaseComputeServiceLiveTest {
if (context != null) if (context != null)
context.close(); context.close();
Properties props = setupProperties(); Properties props = setupProperties();
context = new ComputeServiceContextFactory().createContext(provider, ImmutableSet.of(new Log4JLoggingModule(), context = new ComputeServiceContextFactory().createContext(provider,
getSshModule()), props); ImmutableSet.of(new Log4JLoggingModule(), getSshModule()), props);
client = context.getComputeService(); client = context.getComputeService();
} }
@ -173,8 +173,8 @@ public abstract class BaseComputeServiceLiveTest {
public void testCorrectAuthException() throws Exception { public void testCorrectAuthException() throws Exception {
ComputeServiceContext context = null; ComputeServiceContext context = null;
try { try {
context = new ComputeServiceContextFactory().createContext(provider, "MOMMA", "MIA", ImmutableSet context = new ComputeServiceContextFactory().createContext(provider, "MOMMA", "MIA",
.<Module> of(new Log4JLoggingModule())); ImmutableSet.<Module> of(new Log4JLoggingModule()));
context.getComputeService().listNodes(); context.getComputeService().listNodes();
} catch (AuthorizationException e) { } catch (AuthorizationException e) {
throw e; throw e;
@ -217,7 +217,7 @@ public abstract class BaseComputeServiceLiveTest {
OperatingSystem os = get(nodes, 0).getOperatingSystem(); OperatingSystem os = get(nodes, 0).getOperatingSystem();
try { try {
Map<? extends NodeMetadata, ExecResponse> responses = runScriptWithCreds(tag, os, new Credentials( Map<? extends NodeMetadata, ExecResponse> responses = runScriptWithCreds(tag, os, new Credentials(
good.identity, "romeo")); good.identity, "romeo"));
assert false : "shouldn't pass with a bad password\n" + responses; assert false : "shouldn't pass with a bad password\n" + responses;
} catch (RunScriptOnNodesException e) { } catch (RunScriptOnNodesException e) {
assert getRootCause(e).getMessage().contains("Auth fail") : e; assert getRootCause(e).getMessage().contains("Auth fail") : e;
@ -276,7 +276,7 @@ public abstract class BaseComputeServiceLiveTest {
template = buildTemplate(client.templateBuilder()); template = buildTemplate(client.templateBuilder());
template.getOptions().installPrivateKey(keyPair.get("private")).authorizePublicKey(keyPair.get("public")) template.getOptions().installPrivateKey(keyPair.get("private")).authorizePublicKey(keyPair.get("public"))
.runScript(buildScript(template.getImage().getOperatingSystem())); .runScript(buildScript(template.getImage().getOperatingSystem()));
} }
protected void checkImageIdMatchesTemplate(NodeMetadata node) { protected void checkImageIdMatchesTemplate(NodeMetadata node) {
@ -287,8 +287,8 @@ public abstract class BaseComputeServiceLiveTest {
protected void checkOsMatchesTemplate(NodeMetadata node) { protected void checkOsMatchesTemplate(NodeMetadata node) {
if (node.getOperatingSystem() != null) if (node.getOperatingSystem() != null)
assert node.getOperatingSystem().getFamily().equals(template.getImage().getOperatingSystem().getFamily()) : String assert node.getOperatingSystem().getFamily().equals(template.getImage().getOperatingSystem().getFamily()) : String
.format("expecting family %s but got %s", template.getImage().getOperatingSystem().getFamily(), node .format("expecting family %s but got %s", template.getImage().getOperatingSystem().getFamily(),
.getOperatingSystem()); node.getOperatingSystem());
} }
void assertLocationSameOrChild(Location test, Location expected) { void assertLocationSameOrChild(Location test, Location expected) {
@ -320,10 +320,11 @@ public abstract class BaseComputeServiceLiveTest {
} }
protected Map<? extends NodeMetadata, ExecResponse> runScriptWithCreds(final String tag, OperatingSystem os, protected Map<? extends NodeMetadata, ExecResponse> runScriptWithCreds(final String tag, OperatingSystem os,
Credentials creds) throws RunScriptOnNodesException { Credentials creds) throws RunScriptOnNodesException {
try { try {
return client.runScriptOnNodesMatching(runningWithTag(tag), newStringPayload(buildScript(os).render( return client.runScriptOnNodesMatching(runningWithTag(tag),
OsFamily.UNIX)), overrideCredentialsWith(creds).nameTask("runScriptWithCreds")); newStringPayload(buildScript(os).render(OsFamily.UNIX)),
overrideCredentialsWith(creds).nameTask("runScriptWithCreds"));
} catch (SshException e) { } catch (SshException e) {
throw e; throw e;
} }
@ -353,16 +354,16 @@ public abstract class BaseComputeServiceLiveTest {
@Test(enabled = true, dependsOnMethods = "testCreateAnotherNodeWithANewContextToEnsureSharedMemIsntRequired") @Test(enabled = true, dependsOnMethods = "testCreateAnotherNodeWithANewContextToEnsureSharedMemIsntRequired")
public void testGet() throws Exception { public void testGet() throws Exception {
Map<String, ? extends NodeMetadata> metadataMap = newLinkedHashMap(uniqueIndex(filter(client Map<String, ? extends NodeMetadata> metadataMap = newLinkedHashMap(uniqueIndex(
.listNodesDetailsMatching(all()), and(withTag(tag), not(TERMINATED))), filter(client.listNodesDetailsMatching(all()), and(withTag(tag), not(TERMINATED))),
new Function<NodeMetadata, String>() { new Function<NodeMetadata, String>() {
@Override @Override
public String apply(NodeMetadata from) { public String apply(NodeMetadata from) {
return from.getId(); return from.getId();
} }
})); }));
for (NodeMetadata node : nodes) { for (NodeMetadata node : nodes) {
metadataMap.remove(node.getId()); metadataMap.remove(node.getId());
NodeMetadata metadata = client.getNodeMetadata(node.getId()); NodeMetadata metadata = client.getNodeMetadata(node.getId());
@ -372,15 +373,16 @@ public abstract class BaseComputeServiceLiveTest {
checkImageIdMatchesTemplate(metadata); checkImageIdMatchesTemplate(metadata);
checkOsMatchesTemplate(metadata); checkOsMatchesTemplate(metadata);
assertEquals(metadata.getState(), NodeState.RUNNING); assertEquals(metadata.getState(), NodeState.RUNNING);
assertEquals(metadata.getPrivateAddresses(), node.getPrivateAddresses()); // due to DHCP the addresses can actually change in-between runs.
assertEquals(metadata.getPublicAddresses(), node.getPublicAddresses()); assertEquals(metadata.getPrivateAddresses().size(), node.getPrivateAddresses().size());
assertEquals(metadata.getPublicAddresses().size(), node.getPublicAddresses().size());
} }
assertNodeZero(metadataMap.values()); assertNodeZero(metadataMap.values());
} }
protected void assertNodeZero(Collection<? extends NodeMetadata> metadataSet) { protected void assertNodeZero(Collection<? extends NodeMetadata> metadataSet) {
assert metadataSet.size() == 0 : String.format("nodes left in set: [%s] which didn't match set: [%s]", assert metadataSet.size() == 0 : String.format("nodes left in set: [%s] which didn't match set: [%s]",
metadataSet, nodes); metadataSet, nodes);
} }
@Test(enabled = true, dependsOnMethods = "testGet") @Test(enabled = true, dependsOnMethods = "testGet")
@ -393,16 +395,20 @@ public abstract class BaseComputeServiceLiveTest {
@Test(enabled = true, dependsOnMethods = "testReboot") @Test(enabled = true, dependsOnMethods = "testReboot")
public void testSuspendResume() throws Exception { public void testSuspendResume() throws Exception {
client.suspendNodesMatching(withTag(tag)); client.suspendNodesMatching(withTag(tag));
Set<? extends NodeMetadata> stoppedNodes = refreshNodes(); Set<? extends NodeMetadata> stoppedNodes = refreshNodes();
assert Iterables.all(stoppedNodes, new Predicate<NodeMetadata>() { assert Iterables.all(stoppedNodes, new Predicate<NodeMetadata>() {
@Override @Override
public boolean apply(NodeMetadata input) { public boolean apply(NodeMetadata input) {
return input.getState() == NodeState.SUSPENDED; boolean returnVal = input.getState() == NodeState.SUSPENDED;
if (!returnVal)
System.err.printf("warning: node %s in state %s%n", input.getId(), input.getState());
return returnVal;
} }
}) : nodes; }) : stoppedNodes;
client.resumeNodesMatching(withTag(tag)); client.resumeNodesMatching(withTag(tag));
testGet(); testGet();
@ -462,11 +468,11 @@ public abstract class BaseComputeServiceLiveTest {
} }
template = client.templateBuilder().options(blockOnComplete(false).blockOnPort(8080, 600).inboundPorts(22, 8080)) template = client.templateBuilder().options(blockOnComplete(false).blockOnPort(8080, 600).inboundPorts(22, 8080))
.build(); .build();
// note this is a dependency on the template resolution // note this is a dependency on the template resolution
template.getOptions().runScript( template.getOptions().runScript(
RunScriptData.createScriptInstallAndStartJBoss(keyPair.get("public"), template.getImage() RunScriptData.createScriptInstallAndStartJBoss(keyPair.get("public"), template.getImage()
.getOperatingSystem())); .getOperatingSystem()));
try { try {
NodeMetadata node = getOnlyElement(client.runNodesWithTag(tag, 1, template)); NodeMetadata node = getOnlyElement(client.runNodesWithTag(tag, 1, template));
@ -500,26 +506,26 @@ public abstract class BaseComputeServiceLiveTest {
assert location != location.getParent() : location; assert location != location.getParent() : location;
assert location.getScope() != null : location; assert location.getScope() != null : location;
switch (location.getScope()) { switch (location.getScope()) {
case PROVIDER: case PROVIDER:
assertProvider(location); assertProvider(location);
break; break;
case REGION: case REGION:
assertProvider(location.getParent()); assertProvider(location.getParent());
break; break;
case ZONE: case ZONE:
Location provider = location.getParent().getParent(); Location provider = location.getParent().getParent();
// zone can be a direct descendant of provider // zone can be a direct descendant of provider
if (provider == null) if (provider == null)
provider = location.getParent(); provider = location.getParent();
assertProvider(provider); assertProvider(provider);
break; break;
case HOST: case HOST:
Location provider2 = location.getParent().getParent().getParent(); Location provider2 = location.getParent().getParent().getParent();
// zone can be a direct descendant of provider // zone can be a direct descendant of provider
if (provider2 == null) if (provider2 == null)
provider2 = location.getParent().getParent(); provider2 = location.getParent().getParent();
assertProvider(provider2); assertProvider(provider2);
break; break;
} }
} }
} }
@ -607,7 +613,7 @@ public abstract class BaseComputeServiceLiveTest {
assertEquals(hello.getOutput().trim(), "hello"); assertEquals(hello.getOutput().trim(), "hello");
ExecResponse exec = ssh.exec("java -version"); ExecResponse exec = ssh.exec("java -version");
assert exec.getError().indexOf("1.6") != -1 || exec.getOutput().indexOf("1.6") != -1 : exec + "\n" assert exec.getError().indexOf("1.6") != -1 || exec.getOutput().indexOf("1.6") != -1 : exec + "\n"
+ ssh.exec("cat /tmp/bootstrap/stdout.log /tmp/bootstrap/stderr.log"); + ssh.exec("cat /tmp/bootstrap/stdout.log /tmp/bootstrap/stderr.log");
} finally { } finally {
if (ssh != null) if (ssh != null)
ssh.disconnect(); ssh.disconnect();