queue: Add tracing

This adds tracing throughout the queues, so we can determine what's going on as keys are enqueued, retried, and forgotten.
Sargun Dhillon
2021-02-03 09:42:45 -08:00
parent ac9a1af564
commit 3a361ebabd
4 changed files with 98 additions and 50 deletions
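
The same instrumentation pattern recurs in every hunk below: accept a context.Context, start a span named after the operation, attach identifying fields such as the queue name and key, and end the span on return. The following is a minimal sketch of that pattern, separate from the commit itself; the helper name, queue name, and key are made up for illustration, while the trace calls mirror the ones used in the diff (the import path is assumed from the repository layout).

package main

import (
	"context"

	"github.com/virtual-kubelet/virtual-kubelet/trace"
)

// traceOperation is a hypothetical helper illustrating the span lifecycle the
// commit applies to each queue method: start a span named after the
// operation, attach identifying fields to the returned context, and end the
// span when the operation returns.
func traceOperation(ctx context.Context, queueName, key string) {
	ctx, span := trace.StartSpan(ctx, "traceOperation")
	defer span.End()

	// Fields attached to ctx travel with it, so nested calls (and any logs
	// derived from ctx) can be correlated with this queue and key.
	ctx = span.WithFields(ctx, map[string]interface{}{
		"queue": queueName,
		"key":   key,
	})

	_ = ctx // the real methods pass ctx on to insert() and other nested calls
}

func main() {
	// Hypothetical queue name and key, purely for illustration.
	traceOperation(context.Background(), "syncPodsFromKubernetes", "default/nginx")
}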

View File

@@ -100,39 +100,63 @@ func New(ratelimiter workqueue.RateLimiter, name string, handler ItemHandler) *Q
}
// Enqueue enqueues the key in a rate limited fashion
func (q *Queue) Enqueue(key string) {
func (q *Queue) Enqueue(ctx context.Context, key string) {
q.lock.Lock()
defer q.lock.Unlock()
q.insert(key, true, 0)
q.insert(ctx, key, true, 0)
}
// EnqueueWithoutRateLimit enqueues the key without a rate limit
func (q *Queue) EnqueueWithoutRateLimit(key string) {
func (q *Queue) EnqueueWithoutRateLimit(ctx context.Context, key string) {
q.lock.Lock()
defer q.lock.Unlock()
q.insert(key, false, 0)
q.insert(ctx, key, false, 0)
}
// Forget forgets the key
func (q *Queue) Forget(key string) {
func (q *Queue) Forget(ctx context.Context, key string) {
q.lock.Lock()
defer q.lock.Unlock()
ctx, span := trace.StartSpan(ctx, "Forget")
defer span.End()
ctx = span.WithFields(ctx, map[string]interface{}{
"queue": q.name,
"key": key,
})
if item, ok := q.itemsInQueue[key]; ok {
span.WithField(ctx, "status", "itemInQueue")
delete(q.itemsInQueue, key)
q.items.Remove(item)
return
}
if qi, ok := q.itemsBeingProcessed[key]; ok {
span.WithField(ctx, "status", "itemBeingProcessed")
qi.forget = true
return
}
span.WithField(ctx, "status", "notfound")
}
// insert inserts a new item to be processed at the given time. It will not further delay items if when is later than the
// original time the item was scheduled to be processed. If when is earlier, it will "bring it forward".
func (q *Queue) insert(key string, ratelimit bool, delay time.Duration) *queueItem {
func (q *Queue) insert(ctx context.Context, key string, ratelimit bool, delay time.Duration) *queueItem {
ctx, span := trace.StartSpan(ctx, "insert")
defer span.End()
ctx = span.WithFields(ctx, map[string]interface{}{
"queue": q.name,
"key": key,
"ratelimit": ratelimit,
})
if delay > 0 {
ctx = span.WithField(ctx, "delay", delay.String())
}
defer func() {
select {
case q.wakeupCh <- struct{}{}:
@@ -142,6 +166,7 @@ func (q *Queue) insert(key string, ratelimit bool, delay time.Duration) *queueIt
// First see if the item is already being processed
if item, ok := q.itemsBeingProcessed[key]; ok {
span.WithField(ctx, "status", "itemsBeingProcessed")
when := q.clock.Now().Add(delay)
// Has the item already been redirtied?
if item.redirtiedAt.IsZero() {
@@ -157,12 +182,14 @@ func (q *Queue) insert(key string, ratelimit bool, delay time.Duration) *queueIt
// Is the item already in the queue?
if item, ok := q.itemsInQueue[key]; ok {
span.WithField(ctx, "status", "itemsInQueue")
qi := item.Value.(*queueItem)
when := q.clock.Now().Add(delay)
q.adjustPosition(qi, item, when)
return qi
}
span.WithField(ctx, "status", "added")
now := q.clock.Now()
val := &queueItem{
key: key,
@@ -175,6 +202,7 @@ func (q *Queue) insert(key string, ratelimit bool, delay time.Duration) *queueIt
panic("Non-zero delay with rate limiting not supported")
}
ratelimitDelay := q.ratelimiter.When(key)
span.WithField(ctx, "delay", ratelimitDelay.String())
val.plannedToStartWorkAt = val.plannedToStartWorkAt.Add(ratelimitDelay)
val.delayedViaRateLimit = &ratelimitDelay
} else {
@@ -213,10 +241,10 @@ func (q *Queue) adjustPosition(qi *queueItem, element *list.Element, when time.T
}
// EnqueueWithoutRateLimitWithDelay enqueues without rate limiting, but work will not start until the given delay has elapsed
func (q *Queue) EnqueueWithoutRateLimitWithDelay(key string, after time.Duration) {
func (q *Queue) EnqueueWithoutRateLimitWithDelay(ctx context.Context, key string, after time.Duration) {
q.lock.Lock()
defer q.lock.Unlock()
q.insert(key, false, after)
q.insert(ctx, key, false, after)
}
// Empty returns whether the queue has no items in it
@@ -398,7 +426,7 @@ func (q *Queue) handleQueueItemObject(ctx context.Context, qi *queueItem) error
if qi.requeues+1 < MaxRetries {
// Put the item back on the work Queue to handle any transient errors.
log.G(ctx).WithError(err).Warnf("requeuing %q due to failed sync", qi.key)
newQI := q.insert(qi.key, true, 0)
newQI := q.insert(ctx, qi.key, true, 0)
newQI.requeues = qi.requeues + 1
newQI.originallyAdded = qi.originallyAdded
@@ -410,7 +438,7 @@ func (q *Queue) handleQueueItemObject(ctx context.Context, qi *queueItem) error
// We've exceeded the maximum retries or we were successful.
q.ratelimiter.Forget(qi.key)
if !qi.redirtiedAt.IsZero() {
newQI := q.insert(qi.key, qi.redirtiedWithRatelimit, time.Until(qi.redirtiedAt))
newQI := q.insert(ctx, qi.key, qi.redirtiedWithRatelimit, time.Until(qi.redirtiedAt))
newQI.addedViaRedirty = true
}
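
With the signature changes above, every caller now supplies a context, and the spans started inside the queue become children of whatever span is already on that context. Below is a minimal usage sketch, not part of the commit; the caller and its name are hypothetical, and it is written as if it sat in the same package as the queue so *Queue is directly accessible.

package queue // hypothetical: written as if it lived alongside the queue implementation

import (
	"context"
	"time"

	"github.com/virtual-kubelet/virtual-kubelet/trace"
)

// exampleCaller is hypothetical; it shows how a caller threads its own span
// context into the queue so the Enqueue/Forget spans above become children of
// the caller's span.
func exampleCaller(ctx context.Context, q *Queue, key string) {
	ctx, span := trace.StartSpan(ctx, "exampleCaller")
	defer span.End()

	q.Enqueue(ctx, key)                                       // rate-limited enqueue
	q.EnqueueWithoutRateLimitWithDelay(ctx, key, time.Second) // delayed, not rate limited
	q.Forget(ctx, key)                                        // span's "status" field records itemInQueue / itemBeingProcessed / notfound
}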

View File

@@ -36,7 +36,7 @@ func TestQueueMaxRetries(t *testing.T) {
workqueue.NewItemExponentialFailureRateLimiter(5*time.Millisecond, 10*time.Millisecond),
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
), t.Name(), handler)
wq.Enqueue("test")
wq.Enqueue(context.TODO(), "test")
for n < MaxRetries {
assert.Assert(t, wq.handleQueueItem(ctx))
@@ -53,11 +53,11 @@ func TestForget(t *testing.T) {
}
wq := New(workqueue.DefaultItemBasedRateLimiter(), t.Name(), handler)
wq.Forget("val")
wq.Forget(context.TODO(), "val")
assert.Assert(t, is.Equal(0, wq.Len()))
v := "test"
wq.EnqueueWithoutRateLimit(v)
wq.EnqueueWithoutRateLimit(context.TODO(), v)
assert.Assert(t, is.Equal(1, wq.Len()))
}
@@ -86,8 +86,8 @@ func TestQueueItemNoSleep(t *testing.T) {
})
q.lock.Lock()
q.insert("foo", false, -1*time.Hour)
q.insert("bar", false, -1*time.Hour)
q.insert(ctx, "foo", false, -1*time.Hour)
q.insert(ctx, "bar", false, -1*time.Hour)
q.lock.Unlock()
item, err := q.getNextItem(ctx)
@@ -109,8 +109,8 @@ func TestQueueItemSleep(t *testing.T) {
return nil
})
q.lock.Lock()
q.insert("foo", false, 100*time.Millisecond)
q.insert("bar", false, 100*time.Millisecond)
q.insert(ctx, "foo", false, 100*time.Millisecond)
q.insert(ctx, "bar", false, 100*time.Millisecond)
q.lock.Unlock()
item, err := q.getNextItem(ctx)
@@ -131,7 +131,7 @@ func TestQueueBackgroundAdd(t *testing.T) {
time.AfterFunc(100*time.Millisecond, func() {
q.lock.Lock()
defer q.lock.Unlock()
q.insert("foo", false, 0)
q.insert(ctx, "foo", false, 0)
})
item, err := q.getNextItem(ctx)
@@ -151,13 +151,13 @@ func TestQueueBackgroundAdvance(t *testing.T) {
})
start := time.Now()
q.lock.Lock()
q.insert("foo", false, 10*time.Second)
q.insert(ctx, "foo", false, 10*time.Second)
q.lock.Unlock()
time.AfterFunc(200*time.Millisecond, func() {
q.lock.Lock()
defer q.lock.Unlock()
q.insert("foo", false, 0)
q.insert(ctx, "foo", false, 0)
})
item, err := q.getNextItem(ctx)
@@ -178,14 +178,14 @@ func TestQueueRedirty(t *testing.T) {
q = New(workqueue.DefaultItemBasedRateLimiter(), t.Name(), func(ctx context.Context, key string) error {
assert.Assert(t, is.Equal(key, "foo"))
if atomic.AddInt64(&times, 1) == 1 {
q.EnqueueWithoutRateLimit("foo")
q.EnqueueWithoutRateLimit(context.TODO(), "foo")
} else {
cancel()
}
return nil
})
q.EnqueueWithoutRateLimit("foo")
q.EnqueueWithoutRateLimit(context.TODO(), "foo")
q.Run(ctx, 1)
for !q.Empty() {
time.Sleep(100 * time.Millisecond)
@@ -207,7 +207,7 @@ func TestHeapConcurrency(t *testing.T) {
return nil
})
for i := 0; i < 20; i++ {
q.EnqueueWithoutRateLimit(strconv.Itoa(i))
q.EnqueueWithoutRateLimit(context.TODO(), strconv.Itoa(i))
}
assert.Assert(t, q.Len() == 20)
@@ -241,20 +241,20 @@ func TestHeapOrder(t *testing.T) {
})
q.clock = nonmovingClock{}
q.EnqueueWithoutRateLimitWithDelay("a", 1000)
q.EnqueueWithoutRateLimitWithDelay("b", 2000)
q.EnqueueWithoutRateLimitWithDelay("c", 3000)
q.EnqueueWithoutRateLimitWithDelay("d", 4000)
q.EnqueueWithoutRateLimitWithDelay("e", 5000)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "a", 1000)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "b", 2000)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "c", 3000)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "d", 4000)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "e", 5000)
checkConsistency(t, q)
t.Logf("%v", q)
q.EnqueueWithoutRateLimitWithDelay("d", 1000)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "d", 1000)
checkConsistency(t, q)
t.Logf("%v", q)
q.EnqueueWithoutRateLimitWithDelay("c", 1001)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "c", 1001)
checkConsistency(t, q)
t.Logf("%v", q)
q.EnqueueWithoutRateLimitWithDelay("e", 999)
q.EnqueueWithoutRateLimitWithDelay(context.TODO(), "e", 999)
checkConsistency(t, q)
t.Logf("%v", q)
}
@@ -316,7 +316,7 @@ func TestRateLimiter(t *testing.T) {
enqueued := 0
syncMap.Range(func(key, value interface{}) bool {
enqueued++
q.Enqueue(key.(string))
q.Enqueue(context.TODO(), key.(string))
return true
})
@@ -369,11 +369,11 @@ func TestQueueForgetInProgress(t *testing.T) {
q = New(workqueue.DefaultItemBasedRateLimiter(), t.Name(), func(ctx context.Context, key string) error {
assert.Assert(t, is.Equal(key, "foo"))
atomic.AddInt64(&times, 1)
q.Forget(key)
q.Forget(context.TODO(), key)
return errors.New("test")
})
q.EnqueueWithoutRateLimit("foo")
q.EnqueueWithoutRateLimit(context.TODO(), "foo")
go q.Run(ctx, 1)
for !q.Empty() {
time.Sleep(100 * time.Millisecond)
@@ -390,8 +390,8 @@ func TestQueueForgetBeforeStart(t *testing.T) {
panic("shouldn't be called")
})
q.EnqueueWithoutRateLimit("foo")
q.Forget("foo")
q.EnqueueWithoutRateLimit(context.TODO(), "foo")
q.Forget(context.TODO(), "foo")
go q.Run(ctx, 1)
for !q.Empty() {
time.Sleep(100 * time.Millisecond)
@@ -401,26 +401,28 @@ func TestQueueForgetBeforeStart(t *testing.T) {
func TestQueueMoveItem(t *testing.T) {
t.Parallel()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
q := New(workqueue.DefaultItemBasedRateLimiter(), t.Name(), func(ctx context.Context, key string) error {
panic("shouldn't be called")
})
q.clock = nonmovingClock{}
q.insert("foo", false, 3000)
q.insert("bar", false, 2000)
q.insert("baz", false, 1000)
q.insert(ctx, "foo", false, 3000)
q.insert(ctx, "bar", false, 2000)
q.insert(ctx, "baz", false, 1000)
checkConsistency(t, q)
t.Log(q)
q.insert("foo", false, 2000)
q.insert(ctx, "foo", false, 2000)
checkConsistency(t, q)
t.Log(q)
q.insert("foo", false, 1999)
q.insert(ctx, "foo", false, 1999)
checkConsistency(t, q)
t.Log(q)
q.insert("foo", false, 999)
q.insert(ctx, "foo", false, 999)
checkConsistency(t, q)
t.Log(q)
}

View File

@@ -330,7 +330,7 @@ func (pc *PodController) enqueuePodStatusUpdate(ctx context.Context, pod *corev1
}
kpod.lastPodStatusReceivedFromProvider = pod
kpod.Unlock()
pc.syncPodStatusFromProvider.Enqueue(key)
pc.syncPodStatusFromProvider.Enqueue(ctx, key)
}
func (pc *PodController) syncPodStatusFromProviderHandler(ctx context.Context, key string) (retErr error) {

View File

@@ -302,14 +302,25 @@ func (pc *PodController) Run(ctx context.Context, podSyncWorkers int) (retErr er
var eventHandler cache.ResourceEventHandler = cache.ResourceEventHandlerFuncs{
AddFunc: func(pod interface{}) {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
ctx, span := trace.StartSpan(ctx, "AddFunc")
defer span.End()
if key, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
log.G(ctx).Error(err)
} else {
ctx = span.WithField(ctx, "key", key)
pc.knownPods.Store(key, &knownPod{})
pc.syncPodsFromKubernetes.Enqueue(key)
pc.syncPodsFromKubernetes.Enqueue(ctx, key)
}
},
UpdateFunc: func(oldObj, newObj interface{}) {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
ctx, span := trace.StartSpan(ctx, "UpdateFunc")
defer span.End()
// Create a copy of the old and new pod objects so we don't mutate the cache.
oldPod := oldObj.(*corev1.Pod)
newPod := newObj.(*corev1.Pod)
@@ -318,6 +329,7 @@ func (pc *PodController) Run(ctx context.Context, podSyncWorkers int) (retErr er
if key, err := cache.MetaNamespaceKeyFunc(newPod); err != nil {
log.G(ctx).Error(err)
} else {
ctx = span.WithField(ctx, "key", key)
obj, ok := pc.knownPods.Load(key)
if !ok {
// Pods are only ever *added* to knownPods in the above AddFunc, and removed
@@ -332,25 +344,31 @@ func (pc *PodController) Run(ctx context.Context, podSyncWorkers int) (retErr er
// This means that the pod in the API server was changed by someone else [this can be okay], but we skipped
// a status update on our side because we compared the status received from the provider to the status
// received from the k8s api server based on outdated information.
pc.syncPodStatusFromProvider.Enqueue(key)
pc.syncPodStatusFromProvider.Enqueue(ctx, key)
// Reset this to avoid re-adding it continuously
kPod.lastPodStatusUpdateSkipped = false
}
kPod.Unlock()
if podShouldEnqueue(oldPod, newPod) {
pc.syncPodsFromKubernetes.Enqueue(key)
pc.syncPodsFromKubernetes.Enqueue(ctx, key)
}
}
},
DeleteFunc: func(pod interface{}) {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
ctx, span := trace.StartSpan(ctx, "DeleteFunc")
defer span.End()
if key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(pod); err != nil {
log.G(ctx).Error(err)
} else {
ctx = span.WithField(ctx, "key", key)
pc.knownPods.Delete(key)
pc.syncPodsFromKubernetes.Enqueue(key)
pc.syncPodsFromKubernetes.Enqueue(ctx, key)
// If this pod was in the deletion queue, forget about it
pc.deletePodsFromKubernetes.Forget(key)
pc.deletePodsFromKubernetes.Forget(ctx, key)
}
},
}
@@ -482,7 +500,7 @@ func (pc *PodController) syncPodInProvider(ctx context.Context, pod *corev1.Pod,
// more context is here: https://github.com/virtual-kubelet/virtual-kubelet/pull/760
if pod.DeletionTimestamp != nil && !running(&pod.Status) {
log.G(ctx).Debug("Force deleting pod from API Server as it is no longer running")
pc.deletePodsFromKubernetes.EnqueueWithoutRateLimit(key)
pc.deletePodsFromKubernetes.EnqueueWithoutRateLimit(ctx, key)
return nil
}
obj, ok := pc.knownPods.Load(key)
@@ -518,7 +536,7 @@ func (pc *PodController) syncPodInProvider(ctx context.Context, pod *corev1.Pod,
return err
}
pc.deletePodsFromKubernetes.EnqueueWithoutRateLimitWithDelay(key, time.Second*time.Duration(*pod.DeletionGracePeriodSeconds))
pc.deletePodsFromKubernetes.EnqueueWithoutRateLimitWithDelay(ctx, key, time.Second*time.Duration(*pod.DeletionGracePeriodSeconds))
return nil
}