Adds Done() and Err() to pod controller (#735)
Allows callers to wait for pod controller exit in addition to readiness. This means the caller does not have to handle errors from the pod controller running in a goroutine, since it can wait for exit via `Done()` and check the error with `Err()`.
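For context, a minimal sketch of the call pattern this enables — the surrounding helper and the worker count are illustrative placeholders; only Run, Ready, Done, and Err come from this change:

package main

import (
    "context"

    "github.com/virtual-kubelet/virtual-kubelet/node"
)

// runUntilDone is a hypothetical helper showing the intended usage.
// Constructing the controller (node.NewPodController) is elided.
func runUntilDone(ctx context.Context, pc *node.PodController) error {
    go pc.Run(ctx, 10) // nolint:errcheck -- the error is surfaced via Done()/Err()

    select {
    case <-pc.Ready():
        // Controller is up; carry on.
    case <-pc.Done():
        // Controller exited before becoming ready; Err() explains why.
        return pc.Err()
    case <-ctx.Done():
        return ctx.Err()
    }

    // ... interact with the running controller ...

    <-pc.Done() // wait for the controller to exit, e.g. after ctx is cancelled
    return pc.Err()
}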
@@ -27,9 +27,10 @@ There are two primary controllers, the node runner and the pod runner.

    select {
    case <-podRunner.Ready():
        go nodeRunner.Run(ctx)
    case <-ctx.Done():
        return ctx.Err()
    case <-podRunner.Done():
    }
    if podRunner.Err() != nil {
        // handle error
    }

After calling start, cancelling the passed in context will shutdown the
@@ -237,33 +237,20 @@ func TestPodLifecycle(t *testing.T) {

type testFunction func(ctx context.Context, s *system)

type system struct {
    retChan             chan error
    pc                  *PodController
    client              *fake.Clientset
    podControllerConfig PodControllerConfig
}

func (s *system) start(ctx context.Context) chan error {
    podControllerErrChan := make(chan error)
    go func() {
        podControllerErrChan <- s.pc.Run(ctx, podSyncWorkers)
    }()

// We need to wait for the pod controller to start. If there is an error before the pod controller starts, or
// the context is cancelled. If the context is cancelled, the startup will be aborted, and the pod controller
// will return an error, so we don't need to wait on ctx.Done()
func (s *system) start(ctx context.Context) error {
    go s.pc.Run(ctx, podSyncWorkers) // nolint:errcheck
    select {
    case <-s.pc.Ready():
        // This listens for errors, or exits in the future.
        go func() {
            podControllerErr := <-podControllerErrChan
            s.retChan <- podControllerErr
        }()
    // If there is an error before things are ready, we need to forward it immediately
    case podControllerErr := <-podControllerErrChan:
        s.retChan <- podControllerErr
    case <-s.pc.Done():
    case <-ctx.Done():
        return ctx.Err()
    }
    return s.retChan
    return s.pc.Err()
}

func wireUpSystem(ctx context.Context, provider PodLifecycleHandler, f testFunction) error {
@@ -305,8 +292,7 @@ func wireUpSystem(ctx context.Context, provider PodLifecycleHandler, f testFunct
    configMapInformer := sharedInformerFactory.Core().V1().ConfigMaps()
    serviceInformer := sharedInformerFactory.Core().V1().Services()
    sys := &system{
        client:  client,
        retChan: make(chan error, 1),
        client:  client,
        podControllerConfig: PodControllerConfig{
            PodClient:   client.CoreV1(),
            PodInformer: podInformer,
@@ -338,7 +324,7 @@ func wireUpSystem(ctx context.Context, provider PodLifecycleHandler, f testFunct

    // Shutdown the pod controller, and wait for it to exit
    cancel()
    return <-sys.retChan
    return nil
}

func testFailedPodScenario(ctx context.Context, t *testing.T, s *system) {
@@ -359,7 +345,7 @@ func testTerminalStatePodScenario(ctx context.Context, t *testing.T, s *system,
    assert.NilError(t, e)

    // Start the pod controller
    s.start(ctx)
    assert.NilError(t, s.start(ctx))

    for s.pc.k8sQ.Len() > 0 {
        time.Sleep(10 * time.Millisecond)
@@ -379,7 +365,7 @@ func testDanglingPodScenario(ctx context.Context, t *testing.T, s *system, m *mo
    assert.NilError(t, m.CreatePod(ctx, pod))

    // Start the pod controller
    s.start(ctx)
    assert.NilError(t, s.start(ctx))

    assert.Assert(t, is.Equal(m.deletes.read(), 1))

@@ -444,8 +430,7 @@ func testCreateStartDeleteScenario(ctx context.Context, t *testing.T, s *system,
        watchErrCh <- watchErr
    }()

    // Start the pod controller
    podControllerErrCh := s.start(ctx)
    assert.NilError(t, s.start(ctx))

    // Wait for the pod to go into running
    select {
@@ -453,9 +438,6 @@ func testCreateStartDeleteScenario(ctx context.Context, t *testing.T, s *system,
        t.Fatalf("Context ended early: %s", ctx.Err().Error())
    case err = <-watchErrCh:
        assert.NilError(t, err)
    case err = <-podControllerErrCh:
        assert.NilError(t, err)
        t.Fatal("Pod controller terminated early")
    }

    // Setup a watch prior to pod deletion
@@ -483,12 +465,8 @@ func testCreateStartDeleteScenario(ctx context.Context, t *testing.T, s *system,
    select {
    case <-ctx.Done():
        t.Fatalf("Context ended early: %s", ctx.Err().Error())
    case err = <-podControllerErrCh:
        assert.NilError(t, err)
        t.Fatal("Pod controller exited prematurely without error")
    case err = <-watchErrCh:
        assert.NilError(t, err)

    }
}

@@ -526,18 +504,14 @@ func testUpdatePodWhileRunningScenario(ctx context.Context, t *testing.T, s *sys
    }()

    // Start the pod controller
    podControllerErrCh := s.start(ctx)
    assert.NilError(t, s.start(ctx))

    // Wait for pod to be in running
    select {
    case <-ctx.Done():
        t.Fatalf("Context ended early: %s", ctx.Err().Error())
    case err = <-podControllerErrCh:
        assert.NilError(t, err)
        t.Fatal("Pod controller exited prematurely without error")
    case err = <-watchErrCh:
        assert.NilError(t, err)

    }

    // Update the pod
@@ -592,14 +566,14 @@ func testPodStatusMissingWhileRunningScenario(ctx context.Context, t *testing.T,
    }()

    // Start the pod controller
    podControllerErrCh := s.start(ctx)
    assert.NilError(t, s.start(ctx))

    // Wait for pod to be in running
    select {
    case <-ctx.Done():
        t.Fatalf("Context ended early: %s", ctx.Err().Error())
    case err = <-podControllerErrCh:
        assert.NilError(t, err)
    case <-s.pc.Done():
        assert.NilError(t, s.pc.Err())
        t.Fatal("Pod controller exited prematurely without error")
    case err = <-watchErrCh:
        assert.NilError(t, err)
@@ -627,8 +601,8 @@ func testPodStatusMissingWhileRunningScenario(ctx context.Context, t *testing.T,
    select {
    case <-ctx.Done():
        t.Fatalf("Context ended early: %s", ctx.Err().Error())
    case err = <-podControllerErrCh:
        assert.NilError(t, err)
    case <-s.pc.Done():
        assert.NilError(t, s.pc.Err())
        t.Fatal("Pod controller exited prematurely without error")
    case err = <-watchErrCh:
        assert.NilError(t, err)
@@ -654,18 +628,14 @@ func benchmarkCreatePods(ctx context.Context, b *testing.B, s *system) {
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()

    errCh := s.start(ctx)
    assert.NilError(b, s.start(ctx))

    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        pod := newPod(randomizeUID, randomizeName)
        _, err := s.client.CoreV1().Pods(pod.Namespace).Create(pod)
        assert.NilError(b, err)
        select {
        case err = <-errCh:
            b.Fatalf("Benchmark terminated with error: %+v", err)
        default:
        }
        assert.NilError(b, ctx.Err())
    }
}

@@ -17,6 +17,7 @@ package node

import (
    "context"
    "testing"
    "time"

    pkgerrors "github.com/pkg/errors"
    "github.com/virtual-kubelet/virtual-kubelet/errdefs"
@@ -26,7 +27,9 @@ import (
    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    kubeinformers "k8s.io/client-go/informers"
    "k8s.io/client-go/kubernetes/fake"
    "k8s.io/client-go/util/workqueue"
)

type TestController struct {
@@ -40,19 +43,29 @@ func newTestController() *TestController {

    rm := testutil.FakeResourceManager()
    p := newMockProvider()

    iFactory := kubeinformers.NewSharedInformerFactoryWithOptions(fk8s, 10*time.Minute)
    return &TestController{
        PodController: &PodController{
            client:          fk8s.CoreV1(),
            provider:        p,
            resourceManager: rm,
            recorder:        testutil.FakeEventRecorder(5),
            k8sQ:            workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()),
            done:            make(chan struct{}),
            ready:           make(chan struct{}),
            podsInformer:    iFactory.Core().V1().Pods(),
        },
        mock:   p,
        client: fk8s,
    }
}

// Run starts the informer and runs the pod controller
func (tc *TestController) Run(ctx context.Context, n int) error {
    go tc.podsInformer.Informer().Run(ctx.Done())
    return tc.PodController.Run(ctx, n)
}

func TestPodsEqual(t *testing.T) {
    p1 := &corev1.Pod{
        Spec: corev1.PodSpec{
@@ -100,10 +100,6 @@ type PodController struct {
    // recorder is an event recorder for recording Event resources to the Kubernetes API.
    recorder record.EventRecorder

    // ready is a channel which will be closed once the pod controller is fully up and running.
    // this channel will never be closed if there is an error on startup.
    ready chan struct{}

    client corev1client.PodsGetter

    resourceManager *manager.ResourceManager
@@ -113,6 +109,22 @@ type PodController struct {
    // From the time of creation, to termination the knownPods map will contain the pods key
    // (derived from Kubernetes' cache library) -> a *knownPod struct.
    knownPods sync.Map

    // ready is a channel which will be closed once the pod controller is fully up and running.
    // this channel will never be closed if there is an error on startup.
    ready chan struct{}
    // done is closed when Run returns.
    // Once done is closed, `err` may be set to a non-nil value.
    done chan struct{}

    mu sync.Mutex
    // err is set if there is an error while running the pod controller.
    // Typically this would be errors that occur during startup.
    // Once err is set, `Run` should return.
    //
    // This is used since `pc.Run()` is typically called in a goroutine and managing
    // this can be non-trivial for callers.
    err error
}

type knownPod struct {
@@ -180,6 +192,7 @@ func NewPodController(cfg PodControllerConfig) (*PodController, error) {
        provider:        cfg.Provider,
        resourceManager: rm,
        ready:           make(chan struct{}),
        done:            make(chan struct{}),
        recorder:        cfg.EventRecorder,
        k8sQ:            workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "syncPodsFromKubernetes"),
    }
@@ -187,10 +200,21 @@ func NewPodController(cfg PodControllerConfig) (*PodController, error) {
    return pc, nil
}

// Run will set up the event handlers for types we are interested in, as well as syncing informer caches and starting workers.
// It will block until the context is cancelled, at which point it will shutdown the work queue and wait for workers to finish processing their current work items.
func (pc *PodController) Run(ctx context.Context, podSyncWorkers int) error {
    defer pc.k8sQ.ShutDown()
// Run will set up the event handlers for types we are interested in, as well
// as syncing informer caches and starting workers. It will block until the
// context is cancelled, at which point it will shutdown the work queue and
// wait for workers to finish processing their current work items.
//
// Once this returns, you should not re-use the controller.
func (pc *PodController) Run(ctx context.Context, podSyncWorkers int) (retErr error) {
    defer func() {
        pc.k8sQ.ShutDown()

        pc.mu.Lock()
        pc.err = retErr
        close(pc.done)
        pc.mu.Unlock()
    }()

    podStatusQueue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "syncPodStatusFromProvider")
    pc.runSyncFromProvider(ctx, podStatusQueue)
@@ -274,6 +298,19 @@ func (pc *PodController) Ready() <-chan struct{} {
    return pc.ready
}

// Done returns a channel receiver which is closed when the pod controller has exited.
// Once the pod controller has exited you can call `pc.Err()` to see if any error occurred.
func (pc *PodController) Done() <-chan struct{} {
    return pc.done
}

// Err returns any error that has occurred and caused the pod controller to exit.
func (pc *PodController) Err() error {
    pc.mu.Lock()
    defer pc.mu.Unlock()
    return pc.err
}

// runWorker is a long-running function that will continually call the processNextWorkItem function in order to read and process an item on the work queue.
func (pc *PodController) runWorker(ctx context.Context, workerID string, q workqueue.RateLimitingInterface) {
    for pc.processNextWorkItem(ctx, workerID, q) {
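Aside: the done/err pairing above is a small, reusable Go pattern — record the terminal error under a lock, then close the channel, so waiters can safely read the error once Done() fires. A standalone sketch with hypothetical names (not part of this commit):

package main

import (
    "errors"
    "fmt"
    "sync"
)

// doneErr records a terminal error and signals completion.
type doneErr struct {
    mu   sync.Mutex
    err  error
    done chan struct{}
}

func newDoneErr() *doneErr { return &doneErr{done: make(chan struct{})} }

// finish sets err before closing done, under the mutex, so Err() is
// guaranteed to see the final value once Done() is closed.
func (d *doneErr) finish(err error) {
    d.mu.Lock()
    d.err = err
    close(d.done)
    d.mu.Unlock()
}

func (d *doneErr) Done() <-chan struct{} { return d.done }

func (d *doneErr) Err() error {
    d.mu.Lock()
    defer d.mu.Unlock()
    return d.err
}

func main() {
    d := newDoneErr()
    go d.finish(errors.New("startup failed"))
    <-d.Done()
    fmt.Println(d.Err()) // prints: startup failed
}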
node/podcontroller_test.go (new file, 44 lines)
@@ -0,0 +1,44 @@
package node

import (
    "context"
    "testing"
    "time"

    "gotest.tools/assert"
)

func TestPodControllerExitOnContextCancel(t *testing.T) {
    tc := newTestController()
    ctx := context.Background()
    ctxRun, cancel := context.WithCancel(ctx)

    done := make(chan error)
    go func() {
        done <- tc.Run(ctxRun, 1)
    }()

    ctxT, cancelT := context.WithTimeout(ctx, 30*time.Second)
    select {
    case <-ctxT.Done():
        assert.NilError(t, ctxT.Err())
    case <-tc.Ready():
    case <-tc.Done():
    }
    assert.NilError(t, tc.Err())

    cancelT()

    cancel()

    ctxT, cancelT = context.WithTimeout(ctx, 30*time.Second)
    defer cancelT()

    select {
    case <-ctxT.Done():
        assert.NilError(t, ctxT.Err(), "timeout waiting for Run() to exit")
    case err := <-done:
        assert.NilError(t, err)
    }
    assert.NilError(t, tc.Err())
}