Test pods going missing while they're running in legacy providers (#759)

We periodically poll legacy providers for the status of their pods, because we have no other way of knowing when a pod is updated. If a pod somehow goes missing in the provider, that state must be handled. Currently, we either update the API server and mark the pod as failed, or ignore it.
2019-09-04 14:16:14 -07:00
parent 33df981904
commit da57373abb
2 changed files with 101 additions and 6 deletions
--- a/node/lifecycle_test.go
+++ b/node/lifecycle_test.go
@@ -224,6 +224,15 @@ func TestPodLifecycle(t *testing.T) {
 			}))
 		})
 	})
+
+	// podStatusMissingWhileRunningScenario waits for the pod to go into the running state, with a V0 style provider,
+	// and then makes the pod disappear!
+	t.Run("podStatusMissingWhileRunningScenario", func(t *testing.T) {
+		mp := newMockV0Provider()
+		assert.NilError(t, wireUpSystem(ctx, mp, func(ctx context.Context, s *system) {
+			testPodStatusMissingWhileRunningScenario(ctx, t, s, mp)
+		}))
+	})
 }

 type testFunction func(ctx context.Context, s *system)
@@ -547,6 +556,87 @@ func testUpdatePodWhileRunningScenario(ctx context.Context, t *testing.T, s *sys
 	assert.NilError(t, m.updates.until(ctx, func(v int) bool { return v > 0 }))
 }

+// testPodStatusMissingWhileRunningScenario creates a pod, waits for it to be reported as
+// Running, removes it from the mock provider's pod store (m.pods), and then expects the
+// pod to be reported as Failed with reason podStatusReasonNotFound.
+func testPodStatusMissingWhileRunningScenario(ctx context.Context, t *testing.T, s *system, m *mockV0Provider) {
+	t.Parallel()
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	p := newPod()
+	key, err := buildKey(p)
+	assert.NilError(t, err)
+
+	listOptions := metav1.ListOptions{
+		FieldSelector: fields.OneTermEqualSelector("metadata.name", p.ObjectMeta.Name).String(),
+	}
+
+	// Buffered (one slot per watch goroutine below) so a goroutine can always deliver its
+	// result and exit, even when the test has already left through another select case.
+	watchErrCh := make(chan error, 2)
+
+	// watchForPhase reports on watchErrCh once w has seen the pod in the given phase, or
+	// once the watch has failed. On success it stores a copy of the observed pod in p.
+	watchForPhase := func(w watch.Interface, phase corev1.PodPhase) {
+		go func() {
+			ev, watchErr := watchutils.UntilWithoutRetry(ctx, w, func(ev watch.Event) (bool, error) {
+				return ev.Object.(*corev1.Pod).Status.Phase == phase, nil
+			})
+			// The returned event can be nil when the watch fails, so only use it on success.
+			if watchErr == nil {
+				// This deepcopy is required to please the race detector
+				p = ev.Object.(*corev1.Pod).DeepCopy()
+			}
+			watchErrCh <- watchErr
+		}()
+	}
+
+	// Create a Pod
+	_, err = s.client.CoreV1().Pods(testNamespace).Create(p)
+	assert.NilError(t, err)
+
+	// Setup a watch to check if the pod is in running
+	runningWatcher, err := s.client.CoreV1().Pods(testNamespace).Watch(listOptions)
+	assert.NilError(t, err)
+	defer runningWatcher.Stop()
+	watchForPhase(runningWatcher, corev1.PodRunning)
+
+	// Start the pod controller
+	podControllerErrCh := s.start(ctx)
+
+	// awaitWatch blocks until the pending watch reports success. It fails the test if the
+	// context ends, the pod controller exits, or the watch reports an error first.
+	awaitWatch := func() {
+		select {
+		case <-ctx.Done():
+			t.Fatalf("Context ended early: %s", ctx.Err().Error())
+		case ctrlErr := <-podControllerErrCh:
+			assert.NilError(t, ctrlErr)
+			t.Fatal("Pod controller exited prematurely without error")
+		case watchErr := <-watchErrCh:
+			assert.NilError(t, watchErr)
+		}
+	}
+
+	// Wait for pod to be in running
+	awaitWatch()
+
+	// Setup a watch to check if the pod is in failed due to provider issues
+	failedWatcher, err := s.client.CoreV1().Pods(testNamespace).Watch(listOptions)
+	assert.NilError(t, err)
+	defer failedWatcher.Stop()
+	watchForPhase(failedWatcher, corev1.PodFailed)
+
+	// delete the pod from the mock provider
+	m.pods.Delete(key)
+
+	// Wait for the pod to be reported as failed
+	awaitWatch()
+
+	assert.Equal(t, p.Status.Reason, podStatusReasonNotFound)
+}
+
 func BenchmarkCreatePods(b *testing.B) {
 	sl := logrus.StandardLogger()
 	sl.SetLevel(logrus.ErrorLevel)