retry spot instance creation when an "Invalid IAM Instance Profile name" error pops up (#9810)

PutRolePolicy and AddRoleToInstanceProfile are eventually consistent, but it is not currently possible to wait for them to complete here: 0785c2f6fc/builder/amazon/common/step_iam_instance_profile.go (L117-L134). This was causing the `CreateFleet` call to fail (100% of the time for me), so for now we retry `CreateFleet` a bit later. Waiting 5 seconds after the previously linked code also fixed this. Test file: ```json { "builders": [ { "type": "amazon-ebs", "region": "eu-west-1", "ami_name": "ubuntu-16.04 test {{timestamp}}", "ami_description": "Ubuntu 16.04 LTS - expand root partition", "source_ami_filter": { "filters": { "virtualization-type": "hvm", "name": "ubuntu/images/*/ubuntu-xenial-16.04-amd64-server-*", "root-device-type": "ebs" }, "owners": [ "099720109477" ], "most_recent": true }, "spot_price": "0.03", "spot_instance_types": [ "t2.small" ], "encrypt_boot": true, "ssh_username": "ubuntu", "ssh_interface": "session_manager", "temporary_iam_instance_profile_policy_document": { "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Action": [ "*" ], "Resource": "*" } ] }, "communicator": "ssh" } ]} ```
2020-08-25 10:10:32 +02:00 · 2020-08-25 10:10:32 +02:00 · a0c09e85df
commit a0c09e85df
parent 1252658848
1 changed files with 31 additions and 10 deletions
--- a/builder/amazon/common/step_run_spot_instance.go
+++ b/builder/amazon/common/step_run_spot_instance.go
@ -6,9 +6,11 @@ import (
 	"fmt"
 	"io/ioutil"
 	"log"
+	"strings"
 	"time"

 	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/aws/request"
 	"github.com/aws/aws-sdk-go/service/ec2"
 	"github.com/hashicorp/packer/common/random"
 	"github.com/hashicorp/packer/common/retry"
@ -278,23 +280,39 @@ func (s *StepRunSpotInstance) Run(ctx context.Context, state multistep.StateBag)
 		Type: aws.String("instant"),
 	}

+	var createOutput *ec2.CreateFleetOutput
+
+	err = retry.Config{
+		Tries: 11,
+		ShouldRetry: func(err error) bool {
+			if strings.Contains(err.Error(), "Invalid IAM Instance Profile name") {
+				// eventual consistency of the profile. PutRolePolicy &
+				// AddRoleToInstanceProfile are eventually consistent and once
+				// we can wait on those operations, this can be removed.
+				return true
+			}
+			return request.IsErrorRetryable(err)
+		},
+		RetryDelay: (&retry.Backoff{InitialBackoff: 500 * time.Millisecond, MaxBackoff: 30 * time.Second, Multiplier: 2}).Linear,
+	}.Run(ctx, func(ctx context.Context) error {
+		createOutput, err = ec2conn.CreateFleet(createFleetInput)
+
+		if err == nil && createOutput.Errors != nil {
+			err = fmt.Errorf("errors: %v", createOutput.Errors)
+		}
+		if err != nil {
+			log.Printf("create request failed %v", err)
+		}
+		return err
+	})
+
 	// Create the request for the spot instance.
-	req, createOutput := ec2conn.CreateFleetRequest(createFleetInput)
-	ui.Message(fmt.Sprintf("Sending spot request (%s)...", req.RequestID))
-	// Actually send the spot connection request.
-	err = req.Send()
 	if err != nil {
 		if createOutput.FleetId != nil {
 			err = fmt.Errorf("Error waiting for fleet request (%s): %s", *createOutput.FleetId, err)
 		} else {
 			err = fmt.Errorf("Error waiting for fleet request: %s", err)
 		}
-		state.Put("error", err)
-		ui.Error(err.Error())
-		return multistep.ActionHalt
-	}
-
-	if len(createOutput.Instances) == 0 {
 		// We can end up with errors because one of the allowed availability
 		// zones doesn't have one of the allowed instance types; as long as
 		// an instance is launched, these errors aren't important.
@ -308,6 +326,9 @@ func (s *StepRunSpotInstance) Run(ctx context.Context, state multistep.StateBag)
 			ui.Error(err.Error())
 			return multistep.ActionHalt
 		}
+		state.Put("error", err)
+		ui.Error(err.Error())
+		return multistep.ActionHalt
 	}

 	instanceId = *createOutput.Instances[0].InstanceIds[0]