diff --git a/virtual-kubelet/commands/root/flag.go b/virtual-kubelet/commands/root/flag.go index 3a5bc74..e599fd9 100644 --- a/virtual-kubelet/commands/root/flag.go +++ b/virtual-kubelet/commands/root/flag.go @@ -78,7 +78,7 @@ func installFlags(flags *pflag.FlagSet, c *Opts) { flags.StringVar(&c.TraceSampleRate, "trace-sample-rate", c.TraceSampleRate, "set probability of tracing samples") flags.DurationVar(&c.InformerResyncPeriod, "full-resync-period", c.InformerResyncPeriod, "how often to perform a full resync of pods between kubernetes and the provider") - + flags.DurationVar(&c.StartupTimeout, "startup-timeout", c.StartupTimeout, "How long to wait for the virtual-kubelet to start") } func getEnv(key, defaultValue string) string { diff --git a/virtual-kubelet/commands/root/opts.go b/virtual-kubelet/commands/root/opts.go index 41038fa..845f404 100644 --- a/virtual-kubelet/commands/root/opts.go +++ b/virtual-kubelet/commands/root/opts.go @@ -78,6 +78,9 @@ type Opts struct { TraceExporters []string TraceSampleRate string TraceConfig opencensus.TracingExporterOptions + + // Startup Timeout is how long to wait for the kubelet to start + StartupTimeout time.Duration } // SetDefaultOpts sets default options for unset values on the passed in option struct. diff --git a/virtual-kubelet/commands/root/root.go b/virtual-kubelet/commands/root/root.go index 099e81f..caacdf6 100644 --- a/virtual-kubelet/commands/root/root.go +++ b/virtual-kubelet/commands/root/root.go @@ -17,6 +17,7 @@ package root import ( "context" "os" + "time" "github.com/cpuguy83/strongerrors" "github.com/pkg/errors" @@ -54,6 +55,9 @@ This allows users to schedule kubernetes workloads on nodes that aren't running } func runRootCommand(ctx context.Context, c Opts) error { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + if ok := providers.ValidOperatingSystems[c.OperatingSystem]; !ok { return strongerrors.InvalidArgument(errors.Errorf("operating system %q is not supported", c.OperatingSystem)) } @@ -166,6 +170,16 @@ func runRootCommand(ctx context.Context, c Opts) error { } }() + if c.StartupTimeout > 0 { + // If there is a startup timeout, it does two things: + // 1. It causes the VK to shutdown if we haven't gotten into an operational state in a time period + // 2. It prevents node advertisement from happening until we're in an operational state + err = waitForVK(ctx, c.StartupTimeout, vk) + if err != nil { + return err + } + } + go func() { if err := node.Run(ctx); err != nil { log.G(ctx).Fatal(err) @@ -178,6 +192,21 @@ func runRootCommand(ctx context.Context, c Opts) error { return nil } +func waitForVK(ctx context.Context, time time.Duration, vk *vkubelet.Server) error { + ctx, cancel := context.WithTimeout(ctx, time) + defer cancel() + + // Wait for the VK / PC close the the ready channel, or time out and return + log.G(ctx).Info("Waiting for pod controller / VK to be ready") + + select { + case <-vk.Ready(): + return nil + case <-ctx.Done(): + return errors.Wrap(ctx.Err(), "Error while starting up VK") + } +} + func newClient(configPath string) (*kubernetes.Clientset, error) { var config *rest.Config