From aee1fde504cdddc98d478bda3bf13c06c979f167 Mon Sep 17 00:00:00 2001
From: Brian Goff
Date: Thu, 1 Nov 2018 14:02:55 -0700
Subject: [PATCH] Fix a case where provider pod status is not found

Updates the pod status in Kubernetes to "Failed" when the pod status is
not found in the provider.

Note that currently most providers return `nil, nil` when a pod is not
found. This works, but providers should probably return a typed error so
we can determine whether the error means "not found" or something else...
but this works as is, so I haven't changed it.
---
 vkubelet/pod.go      | 83 ++++++++++++++++++++++++++++++++++----------
 vkubelet/vkubelet.go |  4 +--
 2 files changed, 66 insertions(+), 21 deletions(-)

diff --git a/vkubelet/pod.go b/vkubelet/pod.go
index 03d3cc613..accd00db4 100644
--- a/vkubelet/pod.go
+++ b/vkubelet/pod.go
@@ -5,6 +5,8 @@ import (
 	"fmt"
 	"time"
 
+	"github.com/cpuguy83/strongerrors/status/ocstatus"
+
 	pkgerrors "github.com/pkg/errors"
 	"github.com/virtual-kubelet/virtual-kubelet/log"
 	"go.opencensus.io/trace"
@@ -19,9 +21,11 @@
 
 func addPodAttributes(span *trace.Span, pod *corev1.Pod) {
 	span.AddAttributes(
-		trace.StringAttribute("uid", string(pod.UID)),
-		trace.StringAttribute("namespace", pod.Namespace),
-		trace.StringAttribute("name", pod.Name),
+		trace.StringAttribute("uid", string(pod.GetUID())),
+		trace.StringAttribute("namespace", pod.GetNamespace()),
+		trace.StringAttribute("name", pod.GetName()),
+		trace.StringAttribute("phase", string(pod.Status.Phase)),
+		trace.StringAttribute("reason", pod.Status.Reason),
 	)
 }
 
@@ -246,26 +250,67 @@ func (s *Server) updatePodStatuses(ctx context.Context) {
 		default:
 		}
 
-		if pod.Status.Phase == corev1.PodSucceeded ||
-			pod.Status.Phase == corev1.PodFailed ||
-			pod.Status.Reason == podStatusReasonProviderFailed {
-			continue
-		}
-
-		status, err := s.provider.GetPodStatus(ctx, pod.Namespace, pod.Name)
-		if err != nil {
-			log.G(ctx).WithField("pod", pod.GetName()).WithField("namespace", pod.GetNamespace()).Error("Error retrieving pod status")
-			return
-		}
-
-		// Update the pod's status
-		if status != nil {
-			pod.Status = *status
-			s.k8sClient.CoreV1().Pods(pod.Namespace).UpdateStatus(pod)
+		if err := s.updatePodStatus(ctx, pod); err != nil {
+			logger := log.G(ctx).WithField("pod", pod.GetName()).WithField("namespace", pod.GetNamespace()).WithField("status", pod.Status.Phase).WithField("reason", pod.Status.Reason)
+			logger.Error(err)
 		}
 	}
 }
 
+func (s *Server) updatePodStatus(ctx context.Context, pod *corev1.Pod) error {
+	ctx, span := trace.StartSpan(ctx, "updatePodStatus")
+	defer span.End()
+	addPodAttributes(span, pod)
+
+	if pod.Status.Phase == corev1.PodSucceeded ||
+		pod.Status.Phase == corev1.PodFailed ||
+		pod.Status.Reason == podStatusReasonProviderFailed {
+		return nil
+	}
+
+	status, err := s.provider.GetPodStatus(ctx, pod.Namespace, pod.Name)
+	if err != nil {
+		span.SetStatus(ocstatus.FromError(err))
+		return pkgerrors.Wrap(err, "error retrieving pod status")
+	}
+
+	// Update the pod's status
+	if status != nil {
+		pod.Status = *status
+	} else {
+		// Only change the status when the pod was already up.
+		// Only doing so when the pod was successfully running makes sure we don't run into race conditions during pod creation.
+		if pod.Status.Phase == corev1.PodRunning || pod.ObjectMeta.CreationTimestamp.Add(time.Minute).Before(time.Now()) {
+			// Set the pod to failed, this makes sure if the underlying container implementation is gone that a new pod will be created.
+			pod.Status.Phase = corev1.PodFailed
+			pod.Status.Reason = "NotFound"
+			pod.Status.Message = "The pod status was not found and may have been deleted from the provider"
+			for i, c := range pod.Status.ContainerStatuses {
+				pod.Status.ContainerStatuses[i].State.Terminated = &corev1.ContainerStateTerminated{
+					ExitCode:    -137,
+					Reason:      "NotFound",
+					Message:     "Container was not found and was likely deleted",
+					FinishedAt:  metav1.NewTime(time.Now()),
+					StartedAt:   c.State.Running.StartedAt,
+					ContainerID: c.ContainerID,
+				}
+				pod.Status.ContainerStatuses[i].State.Running = nil
+			}
+		}
+	}
+
+	if _, err := s.k8sClient.CoreV1().Pods(pod.Namespace).UpdateStatus(pod); err != nil {
+		span.SetStatus(ocstatus.FromError(err))
+		return pkgerrors.Wrap(err, "error while updating pod status in kubernetes")
+	}
+
+	span.Annotate([]trace.Attribute{
+		trace.StringAttribute("new phase", string(pod.Status.Phase)),
+		trace.StringAttribute("new reason", pod.Status.Reason),
+	}, "updated pod status in kubernetes")
+	return nil
+}
+
 // watchForPodEvent waits for pod changes from kubernetes and updates the details accordingly in the local state.
 // This returns after a single pod event.
 func (s *Server) watchForPodEvent(ctx context.Context) error {
diff --git a/vkubelet/vkubelet.go b/vkubelet/vkubelet.go
index 9eb906884..7246c9656 100644
--- a/vkubelet/vkubelet.go
+++ b/vkubelet/vkubelet.go
@@ -160,8 +160,8 @@ func (s *Server) reconcile(ctx context.Context) {
 
 	var failedDeleteCount int64
 	for _, pod := range deletePods {
-		logger := logger.WithField("pod", pod.Name)
-		logger.Debug("Deleting pod '%s'\n", pod.Name)
+		logger := logger.WithField("pod", pod.GetName()).WithField("namespace", pod.GetNamespace())
+		logger.Debug("Deleting pod")
 		if err := s.deletePod(ctx, pod); err != nil {
 			logger.WithError(err).Error("Error deleting pod")
 			failedDeleteCount++
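
For illustration, below is a minimal sketch of the typed "not found" error the commit message alludes to. The package and identifiers (errdefs, NotFound, IsNotFound) are hypothetical and are not part of this patch or of the virtual-kubelet provider API; the sketch only shows how a provider could signal "pod does not exist" explicitly instead of returning `nil, nil`.

// Package errdefs is a hypothetical sketch; it is not part of this patch
// or of the virtual-kubelet provider API.
package errdefs

// notFound is a concrete error type a provider could return from
// GetPodStatus when it has no record of the pod.
type notFound struct{ msg string }

func (e *notFound) Error() string  { return e.msg }
func (e *notFound) NotFound() bool { return true }

// NotFound wraps msg in a typed "not found" error.
func NotFound(msg string) error { return &notFound{msg: msg} }

// IsNotFound reports whether err marks a missing pod, letting a caller such
// as updatePodStatus distinguish "the provider no longer knows this pod"
// (mark the pod Failed) from a transient provider failure (surface the error).
func IsNotFound(err error) bool {
	nf, ok := err.(interface{ NotFound() bool })
	return ok && nf.NotFound()
}

With something like this in place, a provider's GetPodStatus could return `nil, errdefs.NotFound("pod not found")` for a missing pod, and updatePodStatus could branch on errdefs.IsNotFound(err) rather than on a nil status.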