Make retries a bit smarter, clean up language to be gentler, and give up on parsing stdout for tunnel launch

This commit is contained in:
Megan Marsh 2020-04-24 11:09:00 -07:00
parent 3e1ddad0c7
commit 937a4859d4
2 changed files with 62 additions and 19 deletions

View File

@ -53,6 +53,14 @@ type TunnelDriver interface {
StopTunnel() StopTunnel()
} }
type RetryableTunnelError struct {
s string
}
func (e RetryableTunnelError) Error() string {
return "Tunnel start: " + e.s
}
type StepStartTunnel struct { type StepStartTunnel struct {
IAPConf *IAPConfig IAPConf *IAPConfig
CommConf *communicator.Config CommConf *communicator.Config
@ -190,8 +198,12 @@ func (s *StepStartTunnel) Run(ctx context.Context, state multistep.StateBag) mul
err = retry.Config{ err = retry.Config{
Tries: 11, Tries: 11,
ShouldRetry: func(err error) bool { ShouldRetry: func(err error) bool {
// TODO be stricter with retries. switch err.(type) {
return true case RetryableTunnelError:
return true
default:
return false
}
}, },
RetryDelay: (&retry.Backoff{InitialBackoff: 200 * time.Millisecond, MaxBackoff: 30 * time.Second, Multiplier: 2}).Linear, RetryDelay: (&retry.Backoff{InitialBackoff: 200 * time.Millisecond, MaxBackoff: 30 * time.Second, Multiplier: 2}).Linear,
}.Run(ctx, func(ctx context.Context) error { }.Run(ctx, func(ctx context.Context) error {

View File

@ -3,10 +3,10 @@
package googlecompute package googlecompute
import ( import (
"bufio"
"bytes" "bytes"
"context" "context"
"fmt" "fmt"
"io"
"log" "log"
"os/exec" "os/exec"
"strings" "strings"
@ -33,29 +33,60 @@ func (t *TunnelDriverLinux) StartTunnel(cancelCtx context.Context, tempScriptFil
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
err := cmd.Start() err := cmd.Start()
log.Printf("Waiting 30s for tunnel to create...")
if err != nil { if err != nil {
err := fmt.Errorf("Error calling gcloud sdk to launch IAP tunnel: %s", err := fmt.Errorf("Error calling gcloud sdk to launch IAP tunnel: %s",
err) err)
return err return err
} }
time.Sleep(10 * time.Second) // Give tunnel 30 seconds to either launch, or return an error.
// read stdout // Unfortunately, the SDK doesn't provide any official acknowledgment that
scanner := bufio.NewScanner(&stderr) // the tunnel is launched when it's not being run through a TTY so we
log.Printf("[start-iap-tunnel] stderr is:") // are just trusting here that 30s is enough to know whether the tunnel
for scanner.Scan() { // launch was going to fail. Yep, feels icky to me too. But I spent an
line := scanner.Text() // afternoon trying to figure out how to get the SDK to actually send
log.Println(line) // the "Listening on port [n]" line I see when I run it manually, and I
// can't justify spending more time than that on aesthetics.
for i := 0; i < 30; i++ {
time.Sleep(1 * time.Second)
if strings.Contains(line, "ERROR") { lineStderr, err := stderr.ReadString('\n')
errIdx := strings.Index(line, "ERROR:") if err != nil && err != io.EOF {
return fmt.Errorf("ERROR: %s", line[errIdx+7:]) log.Printf("Err from scanning stderr is %s", err)
return fmt.Errorf("Error reading stderr from tunnel launch: %s", err)
}
if lineStderr != "" {
log.Printf("stderr: %s", lineStderr)
}
lineStdout, err := stdout.ReadString('\n')
if err != nil && err != io.EOF {
log.Printf("Err from scanning stdout is %s", err)
return fmt.Errorf("Error reading stdout from tunnel launch: %s", err)
}
if lineStdout != "" {
log.Printf("stdout: %s", lineStdout)
}
if strings.Contains(lineStderr, "ERROR") {
// 4033: Either you don't have permission to access the instance,
// the instance doesn't exist, or the instance is stopped.
// The two sub-errors we may see while the permissions settle are
// "not authorized" and "failed to connect to backend," but after
// about a minute of retries this goes away and we're able to
// connect.
if strings.Contains(lineStderr, "4033") {
return RetryableTunnelError{lineStderr}
} else {
log.Printf("NOT RETRYABLE: %s", lineStderr)
return fmt.Errorf("Non-retryable tunnel error: %s", lineStderr)
}
return nil
} }
} }
if err := scanner.Err(); err != nil {
log.Printf("Error scanning tunnel stderr: %s", err) log.Printf("No error detected after tunnel launch; continuing...")
}
// Store successful command on step so we can access it to cancel it // Store successful command on step so we can access it to cancel it
// later. // later.
@ -68,10 +99,10 @@ func (t *TunnelDriverLinux) StopTunnel() {
log.Printf("Cleaning up the IAP tunnel...") log.Printf("Cleaning up the IAP tunnel...")
// Why not just cmd.Process.Kill()? I'm glad you asked. The gcloud // Why not just cmd.Process.Kill()? I'm glad you asked. The gcloud
// call spawns a python subprocess that listens on the port, and you // call spawns a python subprocess that listens on the port, and you
// need to use the process _group_ id to kill this process and its // need to use the process _group_ id to halt this process and its
// daemon child. We create the group ID with the syscall.SysProcAttr // daemon child. We create the group ID with the syscall.SysProcAttr
// call inside the retry loop above, and then store that ID on the // call inside the retry loop above, and then store that ID on the
// command so we can destroy it here. // command so we can halt it here.
err := syscall.Kill(-t.cmd.Process.Pid, syscall.SIGINT) err := syscall.Kill(-t.cmd.Process.Pid, syscall.SIGINT)
if err != nil { if err != nil {
log.Printf("Issue stopping IAP tunnel: %s", err) log.Printf("Issue stopping IAP tunnel: %s", err)