Add support for tracing via OpenCencus
This adds a few flags for configuring the tracer. Includes support for jaeger tracing (built into OC).
This commit is contained in:
@@ -26,6 +26,7 @@ import (
|
||||
client "github.com/virtual-kubelet/virtual-kubelet/providers/azure/client"
|
||||
"github.com/virtual-kubelet/virtual-kubelet/providers/azure/client/aci"
|
||||
"github.com/virtual-kubelet/virtual-kubelet/providers/azure/client/network"
|
||||
"go.opencensus.io/trace"
|
||||
"k8s.io/api/core/v1"
|
||||
k8serr "k8s.io/apimachinery/pkg/api/errors"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
@@ -490,9 +491,20 @@ func getKubeProxyExtension(secretPath, masterURI, clusterCIDR string) (*aci.Exte
|
||||
return &extension, nil
|
||||
}
|
||||
|
||||
func addAzureAttributes(span *trace.Span, p *ACIProvider) {
|
||||
span.AddAttributes(
|
||||
trace.StringAttribute("azure.resourceGroup", p.resourceGroup),
|
||||
trace.StringAttribute("azure.region", p.region),
|
||||
)
|
||||
}
|
||||
|
||||
// CreatePod accepts a Pod definition and creates
|
||||
// an ACI deployment
|
||||
func (p *ACIProvider) CreatePod(ctx context.Context, pod *v1.Pod) error {
|
||||
ctx, span := trace.StartSpan(ctx, "aci.CreatePod")
|
||||
defer span.End()
|
||||
addAzureAttributes(span, p)
|
||||
|
||||
var containerGroup aci.ContainerGroup
|
||||
containerGroup.Location = p.region
|
||||
containerGroup.RestartPolicy = aci.ContainerGroupRestartPolicy(pod.Spec.RestartPolicy)
|
||||
@@ -691,12 +703,20 @@ func (p *ACIProvider) UpdatePod(ctx context.Context, pod *v1.Pod) error {
|
||||
|
||||
// DeletePod deletes the specified pod out of ACI.
|
||||
func (p *ACIProvider) DeletePod(ctx context.Context, pod *v1.Pod) error {
|
||||
ctx, span := trace.StartSpan(ctx, "aci.DeletePod")
|
||||
defer span.End()
|
||||
addAzureAttributes(span, p)
|
||||
|
||||
return p.aciClient.DeleteContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", pod.Namespace, pod.Name))
|
||||
}
|
||||
|
||||
// GetPod returns a pod by name that is running inside ACI
|
||||
// returns nil if a pod by that name is not found.
|
||||
func (p *ACIProvider) GetPod(ctx context.Context, namespace, name string) (*v1.Pod, error) {
|
||||
ctx, span := trace.StartSpan(ctx, "aci.GetPod")
|
||||
defer span.End()
|
||||
addAzureAttributes(span, p)
|
||||
|
||||
cg, err, status := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, name))
|
||||
if err != nil {
|
||||
if status != nil && *status == http.StatusNotFound {
|
||||
@@ -714,6 +734,10 @@ func (p *ACIProvider) GetPod(ctx context.Context, namespace, name string) (*v1.P
|
||||
|
||||
// GetContainerLogs returns the logs of a pod by name that is running inside ACI.
|
||||
func (p *ACIProvider) GetContainerLogs(ctx context.Context, namespace, podName, containerName string, tail int) (string, error) {
|
||||
ctx, span := trace.StartSpan(ctx, "aci.GetContainerLogs")
|
||||
defer span.End()
|
||||
addAzureAttributes(span, p)
|
||||
|
||||
logContent := ""
|
||||
cg, err, _ := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, podName))
|
||||
if err != nil {
|
||||
@@ -725,17 +749,18 @@ func (p *ACIProvider) GetContainerLogs(ctx context.Context, namespace, podName,
|
||||
}
|
||||
// get logs from cg
|
||||
retry := 10
|
||||
for i := 0; i < retry; i++ {
|
||||
var retries int
|
||||
for retries = 0; retries < retry; retries++ {
|
||||
cLogs, err := p.aciClient.GetContainerLogs(ctx, p.resourceGroup, cg.Name, containerName, tail)
|
||||
if err != nil {
|
||||
log.G(ctx).WithField("method", "GetContainerLogs").WithError(err).Debug("Error getting container logs, retrying")
|
||||
span.Annotate(nil, "Error getting container logs, retrying")
|
||||
time.Sleep(5000 * time.Millisecond)
|
||||
} else {
|
||||
logContent = cLogs.Content
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return logContent, err
|
||||
}
|
||||
|
||||
@@ -824,6 +849,10 @@ func (p *ACIProvider) ExecInContainer(name string, uid types.UID, container stri
|
||||
// GetPodStatus returns the status of a pod by name that is running inside ACI
|
||||
// returns nil if a pod by that name is not found.
|
||||
func (p *ACIProvider) GetPodStatus(ctx context.Context, namespace, name string) (*v1.PodStatus, error) {
|
||||
ctx, span := trace.StartSpan(ctx, "aci.GetPodStatus")
|
||||
defer span.End()
|
||||
addAzureAttributes(span, p)
|
||||
|
||||
pod, err := p.GetPod(ctx, namespace, name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -838,6 +867,10 @@ func (p *ACIProvider) GetPodStatus(ctx context.Context, namespace, name string)
|
||||
|
||||
// GetPods returns a list of all pods known to be running within ACI.
|
||||
func (p *ACIProvider) GetPods(ctx context.Context) ([]*v1.Pod, error) {
|
||||
ctx, span := trace.StartSpan(ctx, "aci.GetPods")
|
||||
defer span.End()
|
||||
addAzureAttributes(span, p)
|
||||
|
||||
cgs, err := p.aciClient.ListContainerGroups(ctx, p.resourceGroup)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
||||
@@ -4,6 +4,10 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
|
||||
"go.opencensus.io/plugin/ochttp/propagation/b3"
|
||||
|
||||
"go.opencensus.io/plugin/ochttp"
|
||||
|
||||
azure "github.com/virtual-kubelet/virtual-kubelet/providers/azure/client"
|
||||
)
|
||||
|
||||
@@ -40,6 +44,12 @@ func NewClient(auth *azure.Authentication) (*Client, error) {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Creating Azure client failed: %v", err)
|
||||
}
|
||||
hc := client.HTTPClient
|
||||
hc.Transport = &ochttp.Transport{
|
||||
Base: hc.Transport,
|
||||
Propagation: &b3.HTTPFormat{},
|
||||
NewClientTrace: ochttp.NewSpanAnnotatingClientTrace,
|
||||
}
|
||||
|
||||
return &Client{hc: client.HTTPClient, auth: auth}, nil
|
||||
}
|
||||
|
||||
@@ -53,7 +53,7 @@ func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, container
|
||||
|
||||
// Decode the body from the response.
|
||||
if resp.Body == nil {
|
||||
return nil, errors.New("Create container group returned an empty body in the response"), &resp.StatusCode
|
||||
return nil, errors.New("Get container group returned an empty body in the response"), &resp.StatusCode
|
||||
}
|
||||
var cg ContainerGroup
|
||||
if err := json.NewDecoder(resp.Body).Decode(&cg); err != nil {
|
||||
|
||||
@@ -72,7 +72,7 @@ func (c *Client) GetContainerGroupMetrics(ctx context.Context, resourceGroup, co
|
||||
return nil, errors.Wrap(err, "expanding URL with parameters failed")
|
||||
}
|
||||
|
||||
// SEnd the request.
|
||||
// Send the request.
|
||||
resp, err := c.hc.Do(req)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "sending get container group metrics request failed")
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"github.com/virtual-kubelet/virtual-kubelet/providers/azure/client/aci"
|
||||
"go.opencensus.io/trace"
|
||||
"golang.org/x/sync/errgroup"
|
||||
"k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
@@ -15,12 +16,19 @@ import (
|
||||
|
||||
// GetStatsSummary returns the stats summary for pods running on ACI
|
||||
func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summary, err error) {
|
||||
ctx, span := trace.StartSpan(ctx, "GetSummaryStats")
|
||||
defer span.End()
|
||||
addAzureAttributes(span, p)
|
||||
|
||||
p.metricsSync.Lock()
|
||||
defer p.metricsSync.Unlock()
|
||||
span.Annotate(nil, "acquired metrics mutex")
|
||||
|
||||
if time.Now().Sub(p.metricsSyncTime) < time.Minute {
|
||||
span.AddAttributes(trace.BoolAttribute("preCachedResult", true), trace.StringAttribute("cachedResultSampleTime", p.metricsSyncTime.String()))
|
||||
return p.lastMetric, nil
|
||||
}
|
||||
span.AddAttributes(trace.BoolAttribute("preCachedResult", false), trace.StringAttribute("cachedResultSampleTime", p.metricsSyncTime.String()))
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
@@ -37,6 +45,7 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
|
||||
}()
|
||||
|
||||
pods := p.resourceManager.GetPods()
|
||||
|
||||
var errGroup errgroup.Group
|
||||
chResult := make(chan stats.PodStats, len(pods))
|
||||
|
||||
@@ -48,7 +57,16 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
|
||||
if pod.Status.Phase != v1.PodRunning {
|
||||
continue
|
||||
}
|
||||
pod := pod
|
||||
errGroup.Go(func() error {
|
||||
ctx, span := trace.StartSpan(ctx, "getPodMetrics")
|
||||
defer span.End()
|
||||
span.AddAttributes(
|
||||
trace.StringAttribute("UID", string(pod.UID)),
|
||||
trace.StringAttribute("Name", pod.Name),
|
||||
trace.StringAttribute("Namespace", pod.Namespace),
|
||||
)
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
@@ -58,6 +76,8 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
|
||||
<-sema
|
||||
}()
|
||||
|
||||
span.Annotate(nil, "Acquired semaphore")
|
||||
|
||||
cgName := containerGroupName(pod)
|
||||
// cpu/mem and net stats are split because net stats do not support container level detail
|
||||
systemStats, err := p.aciClient.GetContainerGroupMetrics(ctx, p.resourceGroup, cgName, aci.MetricsRequest{
|
||||
@@ -68,8 +88,10 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
|
||||
Types: []aci.MetricType{aci.MetricTypeCPUUsage, aci.MetricTypeMemoryUsage},
|
||||
})
|
||||
if err != nil {
|
||||
span.SetStatus(trace.Status{Code: trace.StatusCodeUnknown, Message: err.Error()})
|
||||
return errors.Wrapf(err, "error fetching cpu/mem stats for container group %s", cgName)
|
||||
}
|
||||
span.Annotate(nil, "Got system stats")
|
||||
|
||||
netStats, err := p.aciClient.GetContainerGroupMetrics(ctx, p.resourceGroup, cgName, aci.MetricsRequest{
|
||||
Start: start,
|
||||
@@ -78,8 +100,10 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
|
||||
Types: []aci.MetricType{aci.MetricTyperNetworkBytesRecievedPerSecond, aci.MetricTyperNetworkBytesTransmittedPerSecond},
|
||||
})
|
||||
if err != nil {
|
||||
span.SetStatus(trace.Status{Code: trace.StatusCodeUnknown, Message: err.Error()})
|
||||
return errors.Wrapf(err, "error fetching network stats for container group %s", cgName)
|
||||
}
|
||||
span.Annotate(nil, "Got network stats")
|
||||
|
||||
chResult <- collectMetrics(pod, systemStats, netStats)
|
||||
return nil
|
||||
@@ -90,6 +114,7 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
|
||||
return nil, errors.Wrap(err, "error in request to fetch container group metrics")
|
||||
}
|
||||
close(chResult)
|
||||
span.Annotate([]trace.Attribute{trace.Int64Attribute("nPods", int64(len(pods)))}, "Collected stats from Azure")
|
||||
|
||||
var s stats.Summary
|
||||
s.Node = stats.NodeStats{
|
||||
|
||||
Reference in New Issue
Block a user