Move node pinging to its own goroutine

This moves the job of pinging the node provider into its own
goroutine. If it takes a long time, it shouldn't slow down
leases, and vice-versa.

It also adds timeouts for node pings. One of the problems
is that we don't know how long a node ping will take --
there could be a bunch of network calls underneath us.

The point of the lease is to say whether or not the
Kubelet is unreachable, not whether or not the node
pings are "passing".

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
This commit is contained in:
Sargun Dhillon
2020-07-27 20:59:10 -07:00
parent 49c596c5ca
commit d390dfce43
4 changed files with 208 additions and 6 deletions

View File

@@ -0,0 +1,104 @@
package node
import (
"context"
"sync"
"time"
"github.com/virtual-kubelet/virtual-kubelet/log"
"github.com/virtual-kubelet/virtual-kubelet/trace"
"golang.org/x/sync/singleflight"
"k8s.io/apimachinery/pkg/util/wait"
)
// nodePingController periodically pings a NodeProvider and remembers the
// outcome of the most recent ping so that other code can read it via
// getResult.
type nodePingController struct {
	// nodeProvider is the provider whose Ping method is invoked.
	nodeProvider NodeProvider
	// pingInterval is the period handed to the periodic check loop in run.
	pingInterval time.Duration
	// pingTimeout optionally bounds a single check; when nil, the check's
	// context is only cancellable, with no deadline added.
	pingTimeout *time.Duration
	// firstPingCompleted is closed by run once the first check has finished.
	firstPingCompleted chan struct{}

	// The embedded mutex is held by run while it stores a new result.
	sync.Mutex
	// result is the outcome of the most recent check.
	result pingResult
}
// pingResult is the recorded outcome of a single node ping check.
type pingResult struct {
	// error is the error returned by the ping, or the context error when the
	// check's context finished before the ping did. It is nil on success.
	error error
	// pingTime is the time captured just before the ping was issued. It is
	// left as the zero time when the check ended because its context was done.
	pingTime time.Time
}
// newNodePingController builds a controller that pings node every
// pingInterval once run is started. timeout may be nil, in which case no
// per-ping deadline is applied. The returned controller's firstPingCompleted
// channel is open until run finishes its first check.
func newNodePingController(node NodeProvider, pingInterval time.Duration, timeout *time.Duration) *nodePingController {
	npc := new(nodePingController)
	npc.nodeProvider = node
	npc.pingInterval = pingInterval
	npc.pingTimeout = timeout
	npc.firstPingCompleted = make(chan struct{})
	return npc
}
// run performs one ping check immediately, closes firstPingCompleted, and then
// keeps re-checking every npc.pingInterval via wait.UntilWithContext using ctx.
//
// Design goals of a single check:
//  1. If the provider is stuck and does not answer, the recorded result must
//     say so (the check's context error is stored).
//  2. The context given to the provider is cancelled when the check gives up,
//     so a provider that honours its context has a chance to get unstuck.
//  3. Pings are retried, but all pings are funnelled through one singleflight
//     key so that a new check joins a ping that is still in flight rather
//     than starting a second one.
func (npc *nodePingController) run(ctx context.Context) {
	const flightKey = "key"
	group := new(singleflight.Group)

	// Pick how each check derives its own context: with a deadline when a
	// ping timeout is configured, otherwise merely cancellable.
	var deriveCtx func(context.Context) (context.Context, context.CancelFunc)
	if npc.pingTimeout == nil {
		deriveCtx = context.WithCancel
	} else {
		deriveCtx = func(parent context.Context) (context.Context, context.CancelFunc) {
			return context.WithTimeout(parent, *npc.pingTimeout)
		}
	}

	pingOnce := func(ctx context.Context) {
		ctx, cancel := deriveCtx(ctx)
		defer cancel()
		ctx, loopSpan := trace.StartSpan(ctx, "node.pingLoop")
		defer loopSpan.End()

		inflight := group.DoChan(flightKey, func() (interface{}, error) {
			// Capture the start time first so it is what gets reported as
			// the ping time.
			startedAt := time.Now()
			ctx, pingSpan := trace.StartSpan(ctx, "node.pingNode")
			defer pingSpan.End()
			pingErr := npc.nodeProvider.Ping(ctx)
			pingSpan.SetStatus(pingErr)
			return startedAt, pingErr
		})

		// Wait for whichever comes first: the ping finishing or this check's
		// context ending. In the latter case only the error is recorded.
		var outcome pingResult
		select {
		case <-ctx.Done():
			outcome.error = ctx.Err()
			log.G(ctx).WithError(outcome.error).Warn("Failed to ping node due to context cancellation")
		case res := <-inflight:
			outcome.error = res.Err
			outcome.pingTime = res.Val.(time.Time)
		}

		npc.Lock()
		defer npc.Unlock()
		npc.result = outcome
		loopSpan.SetStatus(outcome.error)
	}

	// The first check runs synchronously so that firstPingCompleted is only
	// closed once a result has been stored.
	pingOnce(ctx)
	close(npc.firstPingCompleted)

	wait.UntilWithContext(ctx, pingOnce, npc.pingInterval)
}
// getResult blocks until the first ping check has completed (or ctx is done)
// and then returns the most recently recorded ping result.
//
// It returns (nil, ctx.Err()) if ctx finishes before the first check does.
// Otherwise it returns a pointer to a snapshot of the latest result; the
// snapshot is copied while holding the controller's mutex, so it is not
// affected by later pings and reading it does not race with run, which
// overwrites npc.result under the same mutex.
func (npc *nodePingController) getResult(ctx context.Context) (*pingResult, error) {
	select {
	case <-ctx.Done():
		return nil, ctx.Err()
	case <-npc.firstPingCompleted:
	}

	npc.Lock()
	defer npc.Unlock()
	// Copy instead of returning &npc.result: a pointer to the live field
	// would let callers read it while run is writing a new value.
	snapshot := npc.result
	return &snapshot, nil
}