Make retries a bit smarter, clean up language to be gentler, and give up on parsing stdout for tunnel launch
This commit is contained in:
parent
3e1ddad0c7
commit
937a4859d4
|
@ -53,6 +53,14 @@ type TunnelDriver interface {
|
||||||
StopTunnel()
|
StopTunnel()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type RetryableTunnelError struct {
|
||||||
|
s string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e RetryableTunnelError) Error() string {
|
||||||
|
return "Tunnel start: " + e.s
|
||||||
|
}
|
||||||
|
|
||||||
type StepStartTunnel struct {
|
type StepStartTunnel struct {
|
||||||
IAPConf *IAPConfig
|
IAPConf *IAPConfig
|
||||||
CommConf *communicator.Config
|
CommConf *communicator.Config
|
||||||
|
@ -190,8 +198,12 @@ func (s *StepStartTunnel) Run(ctx context.Context, state multistep.StateBag) mul
|
||||||
err = retry.Config{
|
err = retry.Config{
|
||||||
Tries: 11,
|
Tries: 11,
|
||||||
ShouldRetry: func(err error) bool {
|
ShouldRetry: func(err error) bool {
|
||||||
// TODO be stricter with retries.
|
switch err.(type) {
|
||||||
return true
|
case RetryableTunnelError:
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
},
|
},
|
||||||
RetryDelay: (&retry.Backoff{InitialBackoff: 200 * time.Millisecond, MaxBackoff: 30 * time.Second, Multiplier: 2}).Linear,
|
RetryDelay: (&retry.Backoff{InitialBackoff: 200 * time.Millisecond, MaxBackoff: 30 * time.Second, Multiplier: 2}).Linear,
|
||||||
}.Run(ctx, func(ctx context.Context) error {
|
}.Run(ctx, func(ctx context.Context) error {
|
||||||
|
|
|
@ -3,10 +3,10 @@
|
||||||
package googlecompute
|
package googlecompute
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"log"
|
"log"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"strings"
|
"strings"
|
||||||
|
@ -33,29 +33,60 @@ func (t *TunnelDriverLinux) StartTunnel(cancelCtx context.Context, tempScriptFil
|
||||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
|
||||||
err := cmd.Start()
|
err := cmd.Start()
|
||||||
log.Printf("Waiting 30s for tunnel to create...")
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err := fmt.Errorf("Error calling gcloud sdk to launch IAP tunnel: %s",
|
err := fmt.Errorf("Error calling gcloud sdk to launch IAP tunnel: %s",
|
||||||
err)
|
err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
time.Sleep(10 * time.Second)
|
// Give tunnel 30 seconds to either launch, or return an error.
|
||||||
// read stdout
|
// Unfortunately, the SDK doesn't provide any official acknowledgment that
|
||||||
scanner := bufio.NewScanner(&stderr)
|
// the tunnel is launched when it's not being run through a TTY so we
|
||||||
log.Printf("[start-iap-tunnel] stderr is:")
|
// are just trusting here that 30s is enough to know whether the tunnel
|
||||||
for scanner.Scan() {
|
// launch was going to fail. Yep, feels icky to me too. But I spent an
|
||||||
line := scanner.Text()
|
// afternoon trying to figure out how to get the SDK to actually send
|
||||||
log.Println(line)
|
// the "Listening on port [n]" line I see when I run it manually, and I
|
||||||
|
// can't justify spending more time than that on aesthetics.
|
||||||
|
for i := 0; i < 30; i++ {
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
|
||||||
if strings.Contains(line, "ERROR") {
|
lineStderr, err := stderr.ReadString('\n')
|
||||||
errIdx := strings.Index(line, "ERROR:")
|
if err != nil && err != io.EOF {
|
||||||
return fmt.Errorf("ERROR: %s", line[errIdx+7:])
|
log.Printf("Err from scanning stderr is %s", err)
|
||||||
|
return fmt.Errorf("Error reading stderr from tunnel launch: %s", err)
|
||||||
|
}
|
||||||
|
if lineStderr != "" {
|
||||||
|
log.Printf("stderr: %s", lineStderr)
|
||||||
|
}
|
||||||
|
|
||||||
|
lineStdout, err := stdout.ReadString('\n')
|
||||||
|
if err != nil && err != io.EOF {
|
||||||
|
log.Printf("Err from scanning stdout is %s", err)
|
||||||
|
return fmt.Errorf("Error reading stdout from tunnel launch: %s", err)
|
||||||
|
}
|
||||||
|
if lineStdout != "" {
|
||||||
|
log.Printf("stdout: %s", lineStdout)
|
||||||
|
}
|
||||||
|
|
||||||
|
if strings.Contains(lineStderr, "ERROR") {
|
||||||
|
// 4033: Either you don't have permission to access the instance,
|
||||||
|
// the instance doesn't exist, or the instance is stopped.
|
||||||
|
// The two sub-errors we may see while the permissions settle are
|
||||||
|
// "not authorized" and "failed to connect to backend," but after
|
||||||
|
// about a minute of retries this goes away and we're able to
|
||||||
|
// connect.
|
||||||
|
if strings.Contains(lineStderr, "4033") {
|
||||||
|
return RetryableTunnelError{lineStderr}
|
||||||
|
} else {
|
||||||
|
log.Printf("NOT RETRYABLE: %s", lineStderr)
|
||||||
|
return fmt.Errorf("Non-retryable tunnel error: %s", lineStderr)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if err := scanner.Err(); err != nil {
|
|
||||||
log.Printf("Error scanning tunnel stderr: %s", err)
|
log.Printf("No error detected after tunnel launch; continuing...")
|
||||||
}
|
|
||||||
|
|
||||||
// Store successful command on step so we can access it to cancel it
|
// Store successful command on step so we can access it to cancel it
|
||||||
// later.
|
// later.
|
||||||
|
@ -68,10 +99,10 @@ func (t *TunnelDriverLinux) StopTunnel() {
|
||||||
log.Printf("Cleaning up the IAP tunnel...")
|
log.Printf("Cleaning up the IAP tunnel...")
|
||||||
// Why not just cmd.Process.Kill()? I'm glad you asked. The gcloud
|
// Why not just cmd.Process.Kill()? I'm glad you asked. The gcloud
|
||||||
// call spawns a python subprocess that listens on the port, and you
|
// call spawns a python subprocess that listens on the port, and you
|
||||||
// need to use the process _group_ id to kill this process and its
|
// need to use the process _group_ id to halt this process and its
|
||||||
// daemon child. We create the group ID with the syscall.SysProcAttr
|
// daemon child. We create the group ID with the syscall.SysProcAttr
|
||||||
// call inside the retry loop above, and then store that ID on the
|
// call inside the retry loop above, and then store that ID on the
|
||||||
// command so we can destroy it here.
|
// command so we can halt it here.
|
||||||
err := syscall.Kill(-t.cmd.Process.Pid, syscall.SIGINT)
|
err := syscall.Kill(-t.cmd.Process.Pid, syscall.SIGINT)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("Issue stopping IAP tunnel: %s", err)
|
log.Printf("Issue stopping IAP tunnel: %s", err)
|
||||||
|
|
Loading…
Reference in New Issue