Test pods going missing while they're running in legacy providers (#759)
We periodically poll legacy providers for the status of their pods, because they give us no way of knowing when a pod is updated. If a pod somehow goes missing in the provider, that state must be handled. Currently, we either update the API server and mark the pod as failed, or we ignore it.
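As a rough illustration of the behaviour described above, here is a minimal sketch of one poll pass in Go. The legacyProvider interface, the pollOnce helper, and the decision to report only pods the API server still considers running are assumptions made for this sketch, not code from the repository; the real handling lives in the pod controller (see the node/pod.go changes below).

package sketch

import (
	"context"

	corev1 "k8s.io/api/core/v1"
)

// legacyProvider is a stand-in for a V0-style provider; only the single
// status method needed for this sketch is modelled here.
type legacyProvider interface {
	GetPodStatus(ctx context.Context, namespace, name string) (*corev1.PodStatus, error)
}

// pollOnce refreshes each known pod from the provider and returns the pods
// the provider no longer knows about while the API server still considers
// them running, which is the case this commit is about.
func pollOnce(ctx context.Context, p legacyProvider, known []*corev1.Pod) []*corev1.Pod {
	var missing []*corev1.Pod
	for _, pod := range known {
		status, err := p.GetPodStatus(ctx, pod.Namespace, pod.Name)
		if err != nil || status == nil {
			if pod.Status.Phase == corev1.PodRunning {
				missing = append(missing, pod) // candidate to be marked Failed
			}
			continue // otherwise the missing pod is ignored
		}
		pod.Status = *status // propagate the freshly polled status
	}
	return missing
}

In the repository the equivalent handling already sits in fetchPodStatusFromProvider; the diff below adds a test for the missing-pod case and lifts the literal status strings into named constants.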
@@ -224,6 +224,15 @@ func TestPodLifecycle(t *testing.T) {
 			}))
 		})
 	})
+
+	// podStatusMissingWhileRunningScenario waits for the pod to go into the running state, with a V0 style provider,
+	// and then makes the pod disappear!
+	t.Run("podStatusMissingWhileRunningScenario", func(t *testing.T) {
+		mp := newMockV0Provider()
+		assert.NilError(t, wireUpSystem(ctx, mp, func(ctx context.Context, s *system) {
+			testPodStatusMissingWhileRunningScenario(ctx, t, s, mp)
+		}))
+	})
 }
 
 type testFunction func(ctx context.Context, s *system)
@@ -547,6 +556,87 @@ func testUpdatePodWhileRunningScenario(ctx context.Context, t *testing.T, s *sys
 	assert.NilError(t, m.updates.until(ctx, func(v int) bool { return v > 0 }))
 }
 
+func testPodStatusMissingWhileRunningScenario(ctx context.Context, t *testing.T, s *system, m *mockV0Provider) {
+	t.Parallel()
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	p := newPod()
+	key, err := buildKey(p)
+	assert.NilError(t, err)
+
+	listOptions := metav1.ListOptions{
+		FieldSelector: fields.OneTermEqualSelector("metadata.name", p.ObjectMeta.Name).String(),
+	}
+
+	watchErrCh := make(chan error)
+
+	// Create a Pod
+	_, e := s.client.CoreV1().Pods(testNamespace).Create(p)
+	assert.NilError(t, e)
+
+	// Setup a watch to check if the pod is in running
+	watcher, err := s.client.CoreV1().Pods(testNamespace).Watch(listOptions)
+	assert.NilError(t, err)
+	defer watcher.Stop()
+	go func() {
+		newPod, watchErr := watchutils.UntilWithoutRetry(ctx, watcher,
+			// Wait for the pod to be started
+			func(ev watch.Event) (bool, error) {
+				pod := ev.Object.(*corev1.Pod)
+				return pod.Status.Phase == corev1.PodRunning, nil
+			})
+		// This deepcopy is required to please the race detector
+		p = newPod.Object.(*corev1.Pod).DeepCopy()
+		watchErrCh <- watchErr
+	}()
+
+	// Start the pod controller
+	podControllerErrCh := s.start(ctx)
+
+	// Wait for pod to be in running
+	select {
+	case <-ctx.Done():
+		t.Fatalf("Context ended early: %s", ctx.Err().Error())
+	case err = <-podControllerErrCh:
+		assert.NilError(t, err)
+		t.Fatal("Pod controller exited prematurely without error")
+	case err = <-watchErrCh:
+		assert.NilError(t, err)
+
+	}
+
+	// Setup a watch to check if the pod is in failed due to provider issues
+	watcher, err = s.client.CoreV1().Pods(testNamespace).Watch(listOptions)
+	assert.NilError(t, err)
+	defer watcher.Stop()
+	go func() {
+		newPod, watchErr := watchutils.UntilWithoutRetry(ctx, watcher,
+			// Wait for the pod to be failed
+			func(ev watch.Event) (bool, error) {
+				pod := ev.Object.(*corev1.Pod)
+				return pod.Status.Phase == corev1.PodFailed, nil
+			})
+		// This deepcopy is required to please the race detector
+		p = newPod.Object.(*corev1.Pod).DeepCopy()
+		watchErrCh <- watchErr
+	}()
+
+	// delete the pod from the mock provider
+	m.pods.Delete(key)
+	select {
+	case <-ctx.Done():
+		t.Fatalf("Context ended early: %s", ctx.Err().Error())
+	case err = <-podControllerErrCh:
+		assert.NilError(t, err)
+		t.Fatal("Pod controller exited prematurely without error")
+	case err = <-watchErrCh:
+		assert.NilError(t, err)
+	}
+
+	assert.Equal(t, p.Status.Reason, podStatusReasonNotFound)
+}
+
 func BenchmarkCreatePods(b *testing.B) {
 	sl := logrus.StandardLogger()
 	sl.SetLevel(logrus.ErrorLevel)
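The scenario above simulates the provider losing the pod by calling m.pods.Delete(key). As a hedged sketch of the assumption behind that call, the fragment below models a provider-side store in which deleting the key makes later status lookups fail; the podStore type, errPodNotFound, and the sync.Map layout are illustrative guesses, not the repository's mockV0Provider implementation.

package sketch

import (
	"errors"
	"sync"

	corev1 "k8s.io/api/core/v1"
)

// podStore mimics the assumed shape of the mock provider's pod store: pods
// are kept in a sync.Map keyed by whatever buildKey produces, so deleting
// the key makes subsequent status lookups report the pod as missing.
type podStore struct {
	pods sync.Map // key (string) -> *corev1.Pod
}

var errPodNotFound = errors.New("pod not found in provider")

func (s *podStore) getStatus(key string) (*corev1.PodStatus, error) {
	v, ok := s.pods.Load(key)
	if !ok {
		// A "not found" answer here is what fetchPodStatusFromProvider
		// (node/pod.go below) turns into a Failed pod with reason NotFound.
		return nil, errPodNotFound
	}
	return v.(*corev1.Pod).Status.DeepCopy(), nil
}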
node/pod.go
@@ -34,7 +34,12 @@ import (
 )
 
 const (
-	podStatusReasonProviderFailed = "ProviderFailed"
+	podStatusReasonProviderFailed   = "ProviderFailed"
+	podStatusReasonNotFound         = "NotFound"
+	podStatusMessageNotFound        = "The pod status was not found and may have been deleted from the provider"
+	containerStatusReasonNotFound   = "NotFound"
+	containerStatusMessageNotFound  = "Container was not found and was likely deleted"
+	containerStatusExitCodeNotFound = -137
 )
 
 func addPodAttributes(ctx context.Context, span trace.Span, pod *corev1.Pod) context.Context {
@@ -252,17 +257,17 @@ func (pc *PodController) fetchPodStatusFromProvider(ctx context.Context, q workq
 	// Set the pod to failed, this makes sure if the underlying container implementation is gone that a new pod will be created.
 	podStatus = podFromKubernetes.Status.DeepCopy()
 	podStatus.Phase = corev1.PodFailed
-	podStatus.Reason = "NotFound"
-	podStatus.Message = "The pod status was not found and may have been deleted from the provider"
+	podStatus.Reason = podStatusReasonNotFound
+	podStatus.Message = podStatusMessageNotFound
 	now := metav1.NewTime(time.Now())
 	for i, c := range podStatus.ContainerStatuses {
 		if c.State.Running == nil {
 			continue
 		}
 		podStatus.ContainerStatuses[i].State.Terminated = &corev1.ContainerStateTerminated{
-			ExitCode:    -137,
-			Reason:      "NotFound",
-			Message:     "Container was not found and was likely deleted",
+			ExitCode:    containerStatusExitCodeNotFound,
+			Reason:      containerStatusReasonNotFound,
+			Message:     containerStatusMessageNotFound,
 			FinishedAt:  now,
 			StartedAt:   c.State.Running.StartedAt,
 			ContainerID: c.ContainerID,
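One consequence of the change above is that pods lost by a provider now surface with a stable phase and reason. The fragment below is a hypothetical consumer-side check, not part of this commit: since podStatusReasonNotFound is unexported, code outside the node package would match on the literal string it expands to.

package sketch

import corev1 "k8s.io/api/core/v1"

// lostByProvider reports whether a pod was marked Failed because the
// provider no longer knew about it (reason "NotFound", as set above).
func lostByProvider(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodFailed && pod.Status.Reason == "NotFound"
}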