Fix issue #899: Pod status out of sync after being marked as not ready by controller manager
As described in the issue, if the following sequence happens, we fail to properly update the pod status in the API server:

1. Create a pod in k8s.
2. The provider creates the pod and syncs its status back.
3. The pod in k8s is Ready/Running, all fine.
4. Virtual Kubelet fails to update the node status for some time, for whatever reason (e.g. network connectivity issues).
5. The virtual node is marked as NotReady with the message "Kubelet stopped posting node status".
6. kube-controller-manager marks all pods on that node as Ready = false.
7. Virtual Kubelet never syncs the status of the pod in the provider back to k8s.
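The core of the fix is to remember, per pod, that a status write from the provider to the API server was skipped because the two statuses already matched, and to re-enqueue the pod for a status update as soon as a later API-server update no longer matches the provider's last known status. Below is a minimal, self-contained Go sketch of that idea; the names trackedPod, maybeSkipStatusUpdate, onKubeUpdate and enqueue are hypothetical stand-ins, not part of the virtual-kubelet codebase, and the deletion-timestamp handling of the real code is omitted.

package main

import (
	"fmt"
	"sync"

	"github.com/google/go-cmp/cmp"
	corev1 "k8s.io/api/core/v1"
)

// trackedPod mirrors the per-pod bookkeeping the controller keeps (hypothetical type).
type trackedPod struct {
	sync.Mutex
	lastStatusFromProvider *corev1.Pod
	statusUpdateSkipped    bool
}

// maybeSkipStatusUpdate reports whether the write to the API server can be skipped
// because the statuses already match, recording that decision under the lock.
// (The real check also looks at the provider pod's DeletionTimestamp.)
func (t *trackedPod) maybeSkipStatusUpdate(podFromKube *corev1.Pod) bool {
	t.Lock()
	defer t.Unlock()
	if cmp.Equal(podFromKube.Status, t.lastStatusFromProvider.Status) {
		t.statusUpdateSkipped = true
		return true
	}
	t.statusUpdateSkipped = false
	return false
}

// onKubeUpdate is what the informer's UpdateFunc does conceptually: if the last
// write was skipped and the API-server status has since diverged from the
// provider's, the pod must be re-enqueued for a status update.
func (t *trackedPod) onKubeUpdate(newPod *corev1.Pod, enqueue func()) {
	t.Lock()
	defer t.Unlock()
	if t.statusUpdateSkipped && !cmp.Equal(newPod.Status, t.lastStatusFromProvider.Status) {
		enqueue()
		t.statusUpdateSkipped = false // reset to avoid re-adding the key continuously
	}
}

func main() {
	ready := corev1.PodStatus{Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}}
	notReady := corev1.PodStatus{Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionFalse}}}

	t := &trackedPod{lastStatusFromProvider: &corev1.Pod{Status: ready}}

	// Step 3: k8s and the provider agree, so the write is skipped and the flag is set.
	fmt.Println("skipped:", t.maybeSkipStatusUpdate(&corev1.Pod{Status: ready}))

	// Step 6: kube-controller-manager flips Ready to false; the divergence is detected
	// and the pod is scheduled for a fresh status update.
	t.onKubeUpdate(&corev1.Pod{Status: notReady}, func() { fmt.Println("re-enqueued for status update") })
}

The hunks below implement the same idea inside PodController: updatePodStatus records the skip, and the informer's UpdateFunc re-enqueues the key onto podStatusQ when the statuses drift apart.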
@@ -214,11 +214,15 @@ func (pc *PodController) updatePodStatus(ctx context.Context, podFromKubernetes
 	}
 	kPod := obj.(*knownPod)
 	kPod.Lock()
+
 	podFromProvider := kPod.lastPodStatusReceivedFromProvider.DeepCopy()
-	kPod.Unlock()
 	if cmp.Equal(podFromKubernetes.Status, podFromProvider.Status) && podFromProvider.DeletionTimestamp == nil {
+		kPod.lastPodStatusUpdateSkipped = true
+		kPod.Unlock()
 		return nil
 	}
+	kPod.lastPodStatusUpdateSkipped = false
+	kPod.Unlock()
 	// Pod deleted by provider due some reasons. e.g. a K8s provider, pod created by deployment would be evicted when node is not ready.
 	// If we do not delete pod in K8s, deployment would not create a new one.
 	if podFromProvider.DeletionTimestamp != nil && podFromKubernetes.DeletionTimestamp == nil {
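Note that in updatePodStatus the kPod.Unlock() call moves below the status comparison: lastPodStatusUpdateSkipped is guarded by the knownPod mutex, so the flag has to be written while the lock is still held, and the lock is then released on both the early-return (skip) path and the fall-through path.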
@@ -146,6 +146,7 @@ type knownPod struct {
 	sync.Mutex
 	lastPodStatusReceivedFromProvider *corev1.Pod
 	lastPodUsed                       *corev1.Pod
+	lastPodStatusUpdateSkipped        bool
 }
 
 // PodControllerConfig is used to configure a new PodController.
@@ -303,9 +304,30 @@ func (pc *PodController) Run(ctx context.Context, podSyncWorkers int) (retErr er
 			// At this point we know that something in .metadata or .spec has changed, so we must proceed to sync the pod.
 			if key, err := cache.MetaNamespaceKeyFunc(newPod); err != nil {
 				log.G(ctx).Error(err)
-			} else if podShouldEnqueue(oldPod, newPod) {
-				pc.k8sQ.AddRateLimited(key)
+			} else {
+				obj, ok := pc.knownPods.Load(key)
+				if !ok {
+					// Pods are only ever *added* to knownPods in the above AddFunc, and removed
+					// in the below *DeleteFunc*
+					panic("Pod not found in known pods. This should never happen.")
+				}
+
+				kPod := obj.(*knownPod)
+				kPod.Lock()
+				if kPod.lastPodStatusUpdateSkipped && !cmp.Equal(newPod.Status, kPod.lastPodStatusReceivedFromProvider.Status) {
+					// The last pod from the provider -> kube api server was skipped, but we see they no longer match.
+					// This means that the pod in API server was changed by someone else [this can be okay], but we skipped
+					// a status update on our side because we compared the status received from the provider to the status
+					// received from the k8s api server based on outdated information.
+					pc.podStatusQ.AddRateLimited(key)
+					// Reset this to avoid re-adding it continuously
+					kPod.lastPodStatusUpdateSkipped = false
+				}
+				kPod.Unlock()
+
+				if podShouldEnqueue(oldPod, newPod) {
+					pc.k8sQ.AddRateLimited(key)
+				}
 			}
 		},
 		DeleteFunc: func(pod interface{}) {
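The UpdateFunc change above only schedules work: when a previously skipped status write is found to have gone stale, the pod's key is put on podStatusQ with AddRateLimited and the flag is reset so the same key is not re-added on every subsequent informer event; a status worker later picks the key up and pushes the provider's status again. As a rough illustration of that hand-off using the classic (non-generic) client-go workqueue helpers; the queue, the example key and the handler below are stand-ins, not the controller's actual worker loop:

package main

import (
	"fmt"

	"k8s.io/client-go/util/workqueue"
)

func main() {
	q := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
	q.AddRateLimited("default/nginx") // what UpdateFunc does for a pod whose status drifted

	key, shutdown := q.Get() // a worker goroutine would normally loop on Get
	if !shutdown {
		// The real worker would call something like updatePodStatus(ctx, pod, key) here.
		fmt.Println("processing status update for", key)
		q.Forget(key) // clear the per-item rate-limit history after a successful sync
		q.Done(key)
	}
	q.ShutDown()
}

Resetting lastPodStatusUpdateSkipped right after enqueueing is what prevents every subsequent informer update from re-adding the same key.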
@@ -472,6 +494,7 @@ func (pc *PodController) syncHandler(ctx context.Context, key string) error {
 		return err
+
 	}
 
 	// At this point we know the Pod resource has either been created or updated (which includes being marked for deletion).
 	return pc.syncPodInProvider(ctx, pod, key)
 }