use shared informers and workqueue (#425)
* vendor: add vendored code
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* controller: use shared informers and a work queue
Signed-off-by: Paulo Pires <pjpires@gmail.com>
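For context, the client-go pattern adopted here pairs a shared informer — a cached, event-driven view of pods — with a rate-limited work queue, so event handlers only enqueue keys and a fixed pool of workers does the actual syncing. The sketch below is a minimal illustration of that pattern, not this PR's actual wiring; `buildPodQueue` and `processNextItem` are hypothetical names:

```go
package controller

import (
	"time"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
)

// buildPodQueue wires a shared pod informer to a rate-limited work queue:
// handlers enqueue namespace/name keys; workers dequeue and sync them.
func buildPodQueue(client kubernetes.Interface) (cache.SharedIndexInformer, workqueue.RateLimitingInterface) {
	factory := informers.NewSharedInformerFactory(client, 10*time.Minute)
	informer := factory.Core().V1().Pods().Informer()
	queue := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())

	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			if key, err := cache.MetaNamespaceKeyFunc(obj); err == nil {
				queue.AddRateLimited(key)
			}
		},
		UpdateFunc: func(_, newObj interface{}) {
			if key, err := cache.MetaNamespaceKeyFunc(newObj); err == nil {
				queue.AddRateLimited(key)
			}
		},
		DeleteFunc: func(obj interface{}) {
			// The deletion-handling key func unwraps cache tombstones.
			if key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj); err == nil {
				queue.AddRateLimited(key)
			}
		},
	})
	return informer, queue
}

// processNextItem is one worker iteration: pop a key, sync it, and either
// retry with backoff (on error) or clear its rate-limit history (on success).
func processNextItem(queue workqueue.RateLimitingInterface, sync func(key string) error) bool {
	item, shutdown := queue.Get()
	if shutdown {
		return false
	}
	defer queue.Done(item)

	key := item.(string)
	if err := sync(key); err != nil {
		queue.AddRateLimited(key)
		return true
	}
	queue.Forget(key)
	return true
}
```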
* errors: use cpuguy83/strongerrors
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* aci: fix test that uses resource manager
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* readme: clarify skaffold run before e2e
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* cmd: use root context everywhere
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: refactor pod lifecycle management
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* e2e: fix race in test when observing deletions
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* e2e: test pod forced deletion
Signed-off-by: Paulo Pires <pjpires@gmail.com>
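A forced deletion bypasses the grace period so the API server removes the pod object immediately, which is the interesting case for the provider to observe. A hedged sketch of how a test might trigger it — `forceDeletePod` is a hypothetical helper, and the context-taking `Delete` signature assumes a recent client-go:

```go
package e2e

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// forceDeletePod mirrors `kubectl delete pod --grace-period=0 --force`:
// a zero grace period deletes the API object immediately.
func forceDeletePod(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
	grace := int64(0)
	return client.CoreV1().Pods(namespace).Delete(ctx, name, metav1.DeleteOptions{
		GracePeriodSeconds: &grace,
	})
}
```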
* cmd: fix root context potential leak
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: rename metaKey
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: remove calls to HandleError
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* Revert "errors: use cpuguy83/strongerrors"
This reverts commit f031fc6d.
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* manager: remove redundant lister constraint
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: rename the pod event recorder
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: amend misleading comment
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* mock: add tracing
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: add tracing
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* test: observe timeouts
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* trace: remove unnecessary comments
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: limit concurrency in deleteDanglingPods
Signed-off-by: Paulo Pires <pjpires@gmail.com>
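Bounding fan-out like this is commonly done in Go with a counting semaphore built from a buffered channel. A sketch of that technique under assumed names, not the PR's exact code:

```go
package controller

import (
	"context"
	"sync"

	corev1 "k8s.io/api/core/v1"
)

// deletePodsBounded runs deleteFn over pods with at most maxWorkers
// deletions in flight; the buffered channel acts as a counting semaphore.
func deletePodsBounded(ctx context.Context, pods []*corev1.Pod, maxWorkers int,
	deleteFn func(context.Context, *corev1.Pod) error) {
	sem := make(chan struct{}, maxWorkers)
	var wg sync.WaitGroup
	for _, pod := range pods {
		wg.Add(1)
		sem <- struct{}{} // blocks once maxWorkers deletions are running
		go func(p *corev1.Pod) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot
			_ = deleteFn(ctx, p)     // per-pod errors would be logged, not fatal
		}(pod)
	}
	wg.Wait()
}
```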
* sync: never store context, always pass in calls
Signed-off-by: Paulo Pires <pjpires@gmail.com>
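This follows the standard Go guideline that contexts are request-scoped and belong in function signatures, not struct fields, so cancellation and deadlines propagate per call. A minimal illustration with hypothetical names:

```go
package controller

import "context"

type podSyncer struct {
	// ctx context.Context // anti-pattern: a stored context outlives any one request
}

// syncPod accepts ctx per call, so each invocation observes its caller's
// cancellation and deadline rather than a stale, struct-scoped context.
func (s *podSyncer) syncPod(ctx context.Context, key string) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}
	_ = key // ... sync the pod identified by key ...
	return nil
}
```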
* sync: remove HandleCrash and just panic
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: don't sync succeeded pods
Signed-off-by: Paulo Pires <pjpires@gmail.com>
* sync: ensure pod deletion from kubernetes
Signed-off-by: Paulo Pires <pjpires@gmail.com>
Commit 28a757f4da (parent 0e9cfca585), committed by Robbie Zhang.
```diff
@@ -6,12 +6,14 @@ import (
 	"time"
 
+	pkgerrors "github.com/pkg/errors"
+	"go.opencensus.io/trace"
+	corev1 "k8s.io/api/core/v1"
+	corev1informers "k8s.io/client-go/informers/core/v1"
+	"k8s.io/client-go/kubernetes"
+
 	"github.com/virtual-kubelet/virtual-kubelet/log"
 	"github.com/virtual-kubelet/virtual-kubelet/manager"
 	"github.com/virtual-kubelet/virtual-kubelet/providers"
-	"go.opencensus.io/trace"
-	corev1 "k8s.io/api/core/v1"
-	"k8s.io/client-go/kubernetes"
 )
 
 const (
@@ -28,6 +30,7 @@ type Server struct {
 	resourceManager *manager.ResourceManager
 	podSyncWorkers  int
 	podCh           chan *podNotification
+	podInformer     corev1informers.PodInformer
 }
 
 // Config is used to configure a new server.
@@ -41,6 +44,7 @@ type Config struct {
 	ResourceManager *manager.ResourceManager
 	Taint           *corev1.Taint
 	PodSyncWorkers  int
+	PodInformer     corev1informers.PodInformer
 }
 
 // APIConfig is used to configure the API server of the virtual kubelet.
@@ -66,6 +70,7 @@ func New(ctx context.Context, cfg Config) (s *Server, retErr error) {
 		provider:        cfg.Provider,
 		podSyncWorkers:  cfg.PodSyncWorkers,
 		podCh:           make(chan *podNotification, cfg.PodSyncWorkers),
+		podInformer:     cfg.PodInformer,
 	}
 
 	ctx = log.WithLogger(ctx, log.G(ctx))
@@ -120,127 +125,7 @@ func New(ctx context.Context, cfg Config) (s *Server, retErr error) {
 	return s, nil
 }
 
-// Run starts the server, registers it with Kubernetes and begins watching/reconciling the cluster.
-// Run will block until Stop is called or a SIGINT or SIGTERM signal is received.
+// Run creates and starts an instance of the pod controller, blocking until it stops.
 func (s *Server) Run(ctx context.Context) error {
-	if err := s.watchForPodEvent(ctx); err != nil {
-		if pkgerrors.Cause(err) == context.Canceled {
-			return err
-		}
-		log.G(ctx).Error(err)
-	}
-
-	return nil
-}
-
-// reconcile is the main reconciliation loop that compares differences between Kubernetes and
-// the active provider and reconciles the differences.
-func (s *Server) reconcile(ctx context.Context) {
-	ctx, span := trace.StartSpan(ctx, "reconcile")
-	defer span.End()
-
-	logger := log.G(ctx)
-	logger.Debug("Start reconcile")
-	defer logger.Debug("End reconcile")
-
-	providerPods, err := s.provider.GetPods(ctx)
-	if err != nil {
-		logger.WithError(err).Error("Error getting pod list from provider")
-		return
-	}
-
-	var deletePods []*corev1.Pod
-	for _, pod := range providerPods {
-		// Delete pods that don't exist in Kubernetes
-		if p := s.resourceManager.GetPod(pod.Namespace, pod.Name); p == nil || p.DeletionTimestamp != nil {
-			deletePods = append(deletePods, pod)
-		}
-	}
-	span.Annotate(nil, "Got provider pods")
-
-	var failedDeleteCount int64
-	for _, pod := range deletePods {
-		logger := logger.WithField("pod", pod.GetName()).WithField("namespace", pod.GetNamespace())
-		logger.Debug("Deleting pod")
-		if err := s.deletePod(ctx, pod); err != nil {
-			logger.WithError(err).Error("Error deleting pod")
-			failedDeleteCount++
-			continue
-		}
-	}
-	span.Annotate(
-		[]trace.Attribute{
-			trace.Int64Attribute("expected_delete_pods_count", int64(len(deletePods))),
-			trace.Int64Attribute("failed_delete_pods_count", failedDeleteCount),
-		},
-		"Cleaned up stale provider pods",
-	)
-
-	pods := s.resourceManager.GetPods()
-
-	var createPods []*corev1.Pod
-	cleanupPods := deletePods[:0]
-
-	for _, pod := range pods {
-		var providerPod *corev1.Pod
-		for _, p := range providerPods {
-			if p.Namespace == pod.Namespace && p.Name == pod.Name {
-				providerPod = p
-				break
-			}
-		}
-
-		// Delete pod if DeletionTimestamp is set
-		if pod.DeletionTimestamp != nil {
-			cleanupPods = append(cleanupPods, pod)
-			continue
-		}
-
-		if providerPod == nil &&
-			pod.DeletionTimestamp == nil &&
-			pod.Status.Phase != corev1.PodSucceeded &&
-			pod.Status.Phase != corev1.PodFailed &&
-			pod.Status.Reason != podStatusReasonProviderFailed {
-			createPods = append(createPods, pod)
-		}
-	}
-
-	var failedCreateCount int64
-	for _, pod := range createPods {
-		logger := logger.WithField("pod", pod.Name)
-		logger.Debug("Creating pod")
-		if err := s.createPod(ctx, pod); err != nil {
-			failedCreateCount++
-			logger.WithError(err).Error("Error creating pod")
-			continue
-		}
-	}
-	span.Annotate(
-		[]trace.Attribute{
-			trace.Int64Attribute("expected_created_pods", int64(len(createPods))),
-			trace.Int64Attribute("failed_pod_creates", failedCreateCount),
-		},
-		"Created pods in provider",
-	)
-
-	var failedCleanupCount int64
-	for _, pod := range cleanupPods {
-		logger := logger.WithField("pod", pod.Name)
-		log.Trace(logger, "Pod pending deletion")
-		var err error
-		if err = s.deletePod(ctx, pod); err != nil {
-			logger.WithError(err).Error("Error deleting pod")
-			failedCleanupCount++
-			continue
-		}
-		log.Trace(logger, "Pod deletion complete")
-	}
-
-	span.Annotate(
-		[]trace.Attribute{
-			trace.Int64Attribute("expected_cleaned_up_pods", int64(len(cleanupPods))),
-			trace.Int64Attribute("cleaned_up_pod_failures", failedCleanupCount),
-		},
-		"Cleaned up provider pods marked for deletion",
-	)
+	return NewPodController(s).Run(ctx, s.podSyncWorkers)
 }
```