Move node pinging to its own goroutine

This moves the job of pinging the node provider into its own goroutine. If it takes a long time, it shouldn't slow down leases, and vice-versa. It also adds timeouts for node pings. One of the problems is that we don't know how long a node ping will take -- there could be a bunch of network calls underneath us. The point of the lease is to say whether or not the Kubelet is unreachable, not whether or not the node pings are "passing". Signed-off-by: Sargun Dhillon <sargun@sargun.me>
2020-07-27 20:59:10 -07:00
parent 49c596c5ca
commit d390dfce43
4 changed files with 208 additions and 6 deletions
--- a/node/node_ping_controller.go
+++ b/node/node_ping_controller.go
@@ -0,0 +1,104 @@
+package node
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"github.com/virtual-kubelet/virtual-kubelet/log"
+	"github.com/virtual-kubelet/virtual-kubelet/trace"
+	"golang.org/x/sync/singleflight"
+	"k8s.io/apimachinery/pkg/util/wait"
+)
+
+// nodePingController pings a NodeProvider on a fixed interval and keeps the
+// outcome of the most recent ping so that it can be queried via getResult.
+type nodePingController struct {
+	// nodeProvider is the provider whose Ping method is invoked by run.
+	nodeProvider       NodeProvider
+	// pingInterval is the period passed to wait.UntilWithContext in run.
+	pingInterval       time.Duration
+	// firstPingCompleted is closed by run once the first ping attempt has
+	// finished (successfully or not); getResult waits on it.
+	firstPingCompleted chan struct{}
+	// pingTimeout, when non-nil, bounds each ping attempt via
+	// context.WithTimeout. When nil, run uses context.WithCancel instead,
+	// i.e. no per-ping deadline is applied.
+	pingTimeout        *time.Duration
+
+	// The embedded mutex is taken by run whenever it stores a new result.
+	sync.Mutex
+	// result is the outcome of the most recent ping attempt.
+	result pingResult
+}
+
+// pingResult records the outcome of a single ping attempt made by
+// nodePingController.run.
+type pingResult struct {
+	// pingTime is the time.Now() value captured just before Ping was called.
+	// It is left as the zero time when the attempt ended because its context
+	// was done before a ping result arrived.
+	pingTime time.Time
+	// error is the error returned by Ping, or the context's error when the
+	// attempt ended because the context was done. Nil means the ping passed.
+	error    error
+}
+
+// newNodePingController builds a controller that will ping node every
+// pingInterval once run is started. A nil timeout means individual pings get
+// no deadline of their own; otherwise each ping is limited to *timeout.
+func newNodePingController(node NodeProvider, pingInterval time.Duration, timeout *time.Duration) *nodePingController {
+	controller := new(nodePingController)
+	controller.nodeProvider = node
+	controller.pingInterval = pingInterval
+	controller.pingTimeout = timeout
+	// Unbuffered signalling channel; run closes it after the first ping.
+	controller.firstPingCompleted = make(chan struct{})
+	return controller
+}
+
+// run performs one ping immediately, closes firstPingCompleted to release any
+// getResult callers, and then hands the same check to wait.UntilWithContext so
+// that it is repeated every pingInterval for as long as ctx is live.
+//
+// Design goals of the check:
+//  1. A provider that is stuck and not answering pings must still produce a
+//     result, namely the timeout/cancellation error of the ping's context.
+//  2. The context handed to the provider is cancelled on timeout, so a
+//     provider that honours its context has a chance to get unstuck.
+//  3. Pings are retried on every interval, but there is never more than one
+//     Ping call in flight at a time (enforced with singleflight).
+func (npc *nodePingController) run(ctx context.Context) {
+	const flightKey = "key"
+	var group singleflight.Group
+
+	// Without a configured timeout a ping context is merely cancellable;
+	// with one, every attempt gets its own deadline.
+	newPingContext := context.WithCancel
+	if npc.pingTimeout != nil {
+		newPingContext = func(parent context.Context) (context.Context, context.CancelFunc) {
+			return context.WithTimeout(parent, *npc.pingTimeout)
+		}
+	}
+
+	pingOnce := func(parent context.Context) {
+		attemptCtx, cancelAttempt := newPingContext(parent)
+		defer cancelAttempt()
+		attemptCtx, loopSpan := trace.StartSpan(attemptCtx, "node.pingLoop")
+		defer loopSpan.End()
+
+		// All attempts share one key, so if an earlier Ping is still running
+		// this call attaches to it rather than starting a second one.
+		resultCh := group.DoChan(flightKey, func() (interface{}, error) {
+			startedAt := time.Now()
+			providerCtx, pingSpan := trace.StartSpan(attemptCtx, "node.pingNode")
+			defer pingSpan.End()
+			pingErr := npc.nodeProvider.Ping(providerCtx)
+			pingSpan.SetStatus(pingErr)
+			return startedAt, pingErr
+		})
+
+		// Whichever happens first wins: the attempt's context finishing or
+		// the ping reporting back.
+		var outcome pingResult
+		select {
+		case <-attemptCtx.Done():
+			outcome.error = attemptCtx.Err()
+			log.G(attemptCtx).WithError(outcome.error).Warn("Failed to ping node due to context cancellation")
+		case flight := <-resultCh:
+			outcome.error = flight.Err
+			outcome.pingTime = flight.Val.(time.Time)
+		}
+
+		npc.Lock()
+		defer npc.Unlock()
+		npc.result = outcome
+		loopSpan.SetStatus(outcome.error)
+	}
+
+	// The first check is run by hand so that firstPingCompleted can be closed
+	// as soon as an initial result is available.
+	pingOnce(ctx)
+	close(npc.firstPingCompleted)
+
+	wait.UntilWithContext(ctx, pingOnce, npc.pingInterval)
+}
+
+// getResult waits until the first ping attempt has completed and then returns
+// a snapshot of the most recently recorded ping result.
+//
+// If ctx is done before the first ping completes, it returns (nil, ctx.Err()).
+// The returned pingResult is a private copy: it does not change when later
+// pings are recorded, and callers may read it without holding the lock.
+func (npc *nodePingController) getResult(ctx context.Context) (*pingResult, error) {
+	select {
+	case <-ctx.Done():
+		return nil, ctx.Err()
+	case <-npc.firstPingCompleted:
+	}
+
+	// run overwrites npc.result under the mutex on every ping. Returning
+	// &npc.result would let callers read the struct while it is being
+	// written, so copy it while holding the same lock instead.
+	npc.Lock()
+	snapshot := npc.result
+	npc.Unlock()
+
+	return &snapshot, nil
+}