Add the concept of startup timeout (#597)

This adds two concepts, where one encompasses the other.

Startup timeout
Startup timeout is how long to wait for the entire kubelet
to get into a functional state. Right now, this only waits
for the pod controller's pod informer cache to become
in sync with the API server, but this could be extended to
other informers (like the secrets informer).

Wait For Startup
This changes the behaviour of the virtual kubelet to wait
for the pod controller to start before registering the node.

It is to avoid the race condition where the node is registered,
but we cannot actually do any pod operations.
This commit is contained in:
Sargun Dhillon
2019-05-06 09:25:00 -07:00
committed by Brian Goff
parent 74a16f7f9a
commit f1cb6a7bf6
5 changed files with 65 additions and 3 deletions

View File

@@ -78,7 +78,7 @@ func installFlags(flags *pflag.FlagSet, c *Opts) {
flags.StringVar(&c.TraceSampleRate, "trace-sample-rate", c.TraceSampleRate, "set probability of tracing samples")
flags.DurationVar(&c.InformerResyncPeriod, "full-resync-period", c.InformerResyncPeriod, "how often to perform a full resync of pods between kubernetes and the provider")
flags.DurationVar(&c.StartupTimeout, "startup-timeout", c.StartupTimeout, "How long to wait for the virtual-kubelet to start")
}
func getEnv(key, defaultValue string) string {

View File

@@ -78,6 +78,9 @@ type Opts struct {
TraceExporters []string
TraceSampleRate string
TraceConfig opencensus.TracingExporterOptions
// StartupTimeout is how long to wait for the kubelet to start
StartupTimeout time.Duration
}
// SetDefaultOpts sets default options for unset values on the passed in option struct.

View File

@@ -17,6 +17,7 @@ package root
import (
"context"
"os"
"time"
"github.com/cpuguy83/strongerrors"
"github.com/pkg/errors"
@@ -54,6 +55,9 @@ This allows users to schedule kubernetes workloads on nodes that aren't running
}
func runRootCommand(ctx context.Context, c Opts) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
if ok := providers.ValidOperatingSystems[c.OperatingSystem]; !ok {
return strongerrors.InvalidArgument(errors.Errorf("operating system %q is not supported", c.OperatingSystem))
}
@@ -166,6 +170,16 @@ func runRootCommand(ctx context.Context, c Opts) error {
}
}()
if c.StartupTimeout > 0 {
// If there is a startup timeout, it does two things:
// 1. It causes the VK to shut down if we haven't gotten into an operational state within the timeout
// 2. It prevents node advertisement from happening until we're in an operational state
err = waitForVK(ctx, c.StartupTimeout, vk)
if err != nil {
return err
}
}
go func() {
if err := node.Run(ctx); err != nil {
log.G(ctx).Fatal(err)
@@ -178,6 +192,21 @@ func runRootCommand(ctx context.Context, c Opts) error {
return nil
}
// waitForVK blocks until the virtual kubelet reports readiness or the startup
// timeout elapses, whichever happens first.
//
// It derives a child context from ctx that is bounded by timeout and then
// waits on vk.Ready(). It returns nil as soon as a receive on vk.Ready()
// succeeds. If the derived context finishes first (the timeout expired or the
// parent ctx was cancelled), it returns ctx.Err() wrapped with a startup
// failure message.
func waitForVK(ctx context.Context, timeout time.Duration, vk *vkubelet.Server) error {
	// The parameter is named timeout (not time) so that it does not shadow the
	// imported time package inside this function.
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Wait for the VK / PC to close the ready channel, or time out and return.
	log.G(ctx).Info("Waiting for pod controller / VK to be ready")
	select {
	case <-vk.Ready():
		return nil
	case <-ctx.Done():
		return errors.Wrap(ctx.Err(), "Error while starting up VK")
	}
}
func newClient(configPath string) (*kubernetes.Clientset, error) {
var config *rest.Config