Add support for tracing via OpenCensus

This adds a few flags for configuring the tracer.
Includes support for Jaeger tracing (built into OpenCensus).
This commit is contained in:
Brian Goff
2018-09-19 18:01:39 -07:00
parent 43d32d2301
commit 682b2bccf8
139 changed files with 19834 additions and 42 deletions

View File

@@ -26,6 +26,7 @@ import (
client "github.com/virtual-kubelet/virtual-kubelet/providers/azure/client"
"github.com/virtual-kubelet/virtual-kubelet/providers/azure/client/aci"
"github.com/virtual-kubelet/virtual-kubelet/providers/azure/client/network"
"go.opencensus.io/trace"
"k8s.io/api/core/v1"
k8serr "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
@@ -490,9 +491,20 @@ func getKubeProxyExtension(secretPath, masterURI, clusterCIDR string) (*aci.Exte
return &extension, nil
}
// addAzureAttributes tags span with the Azure resource group and region
// held by the provider p, under the keys "azure.resourceGroup" and
// "azure.region".
func addAzureAttributes(span *trace.Span, p *ACIProvider) {
	resourceGroupAttr := trace.StringAttribute("azure.resourceGroup", p.resourceGroup)
	regionAttr := trace.StringAttribute("azure.region", p.region)
	span.AddAttributes(resourceGroupAttr, regionAttr)
}
// CreatePod accepts a Pod definition and creates
// an ACI deployment
func (p *ACIProvider) CreatePod(ctx context.Context, pod *v1.Pod) error {
ctx, span := trace.StartSpan(ctx, "aci.CreatePod")
defer span.End()
addAzureAttributes(span, p)
var containerGroup aci.ContainerGroup
containerGroup.Location = p.region
containerGroup.RestartPolicy = aci.ContainerGroupRestartPolicy(pod.Spec.RestartPolicy)
@@ -691,12 +703,20 @@ func (p *ACIProvider) UpdatePod(ctx context.Context, pod *v1.Pod) error {
// DeletePod removes the container group backing the given pod from ACI.
// The container group is addressed as "<namespace>-<name>" within the
// provider's resource group. The call is wrapped in an "aci.DeletePod"
// trace span annotated with the provider's Azure attributes.
func (p *ACIProvider) DeletePod(ctx context.Context, pod *v1.Pod) error {
	ctx, span := trace.StartSpan(ctx, "aci.DeletePod")
	defer span.End()
	addAzureAttributes(span, p)

	containerGroup := fmt.Sprintf("%s-%s", pod.Namespace, pod.Name)
	return p.aciClient.DeleteContainerGroup(ctx, p.resourceGroup, containerGroup)
}
// GetPod returns a pod by name that is running inside ACI
// returns nil if a pod by that name is not found.
func (p *ACIProvider) GetPod(ctx context.Context, namespace, name string) (*v1.Pod, error) {
ctx, span := trace.StartSpan(ctx, "aci.GetPod")
defer span.End()
addAzureAttributes(span, p)
cg, err, status := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, name))
if err != nil {
if status != nil && *status == http.StatusNotFound {
@@ -714,6 +734,10 @@ func (p *ACIProvider) GetPod(ctx context.Context, namespace, name string) (*v1.P
// GetContainerLogs returns the logs of a pod by name that is running inside ACI.
func (p *ACIProvider) GetContainerLogs(ctx context.Context, namespace, podName, containerName string, tail int) (string, error) {
ctx, span := trace.StartSpan(ctx, "aci.GetContainerLogs")
defer span.End()
addAzureAttributes(span, p)
logContent := ""
cg, err, _ := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, podName))
if err != nil {
@@ -725,17 +749,18 @@ func (p *ACIProvider) GetContainerLogs(ctx context.Context, namespace, podName,
}
// get logs from cg
retry := 10
for i := 0; i < retry; i++ {
var retries int
for retries = 0; retries < retry; retries++ {
cLogs, err := p.aciClient.GetContainerLogs(ctx, p.resourceGroup, cg.Name, containerName, tail)
if err != nil {
log.G(ctx).WithField("method", "GetContainerLogs").WithError(err).Debug("Error getting container logs, retrying")
span.Annotate(nil, "Error getting container logs, retrying")
time.Sleep(5000 * time.Millisecond)
} else {
logContent = cLogs.Content
break
}
}
return logContent, err
}
@@ -824,6 +849,10 @@ func (p *ACIProvider) ExecInContainer(name string, uid types.UID, container stri
// GetPodStatus returns the status of a pod by name that is running inside ACI
// returns nil if a pod by that name is not found.
func (p *ACIProvider) GetPodStatus(ctx context.Context, namespace, name string) (*v1.PodStatus, error) {
ctx, span := trace.StartSpan(ctx, "aci.GetPodStatus")
defer span.End()
addAzureAttributes(span, p)
pod, err := p.GetPod(ctx, namespace, name)
if err != nil {
return nil, err
@@ -838,6 +867,10 @@ func (p *ACIProvider) GetPodStatus(ctx context.Context, namespace, name string)
// GetPods returns a list of all pods known to be running within ACI.
func (p *ACIProvider) GetPods(ctx context.Context) ([]*v1.Pod, error) {
ctx, span := trace.StartSpan(ctx, "aci.GetPods")
defer span.End()
addAzureAttributes(span, p)
cgs, err := p.aciClient.ListContainerGroups(ctx, p.resourceGroup)
if err != nil {
return nil, err

View File

@@ -4,6 +4,10 @@ import (
"fmt"
"net/http"
"go.opencensus.io/plugin/ochttp/propagation/b3"
"go.opencensus.io/plugin/ochttp"
azure "github.com/virtual-kubelet/virtual-kubelet/providers/azure/client"
)
@@ -40,6 +44,12 @@ func NewClient(auth *azure.Authentication) (*Client, error) {
if err != nil {
return nil, fmt.Errorf("Creating Azure client failed: %v", err)
}
hc := client.HTTPClient
hc.Transport = &ochttp.Transport{
Base: hc.Transport,
Propagation: &b3.HTTPFormat{},
NewClientTrace: ochttp.NewSpanAnnotatingClientTrace,
}
return &Client{hc: client.HTTPClient, auth: auth}, nil
}

View File

@@ -53,7 +53,7 @@ func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, container
// Decode the body from the response.
if resp.Body == nil {
return nil, errors.New("Create container group returned an empty body in the response"), &resp.StatusCode
return nil, errors.New("Get container group returned an empty body in the response"), &resp.StatusCode
}
var cg ContainerGroup
if err := json.NewDecoder(resp.Body).Decode(&cg); err != nil {

View File

@@ -72,7 +72,7 @@ func (c *Client) GetContainerGroupMetrics(ctx context.Context, resourceGroup, co
return nil, errors.Wrap(err, "expanding URL with parameters failed")
}
// SEnd the request.
// Send the request.
resp, err := c.hc.Do(req)
if err != nil {
return nil, errors.Wrap(err, "sending get container group metrics request failed")

View File

@@ -7,6 +7,7 @@ import (
"github.com/pkg/errors"
"github.com/virtual-kubelet/virtual-kubelet/providers/azure/client/aci"
"go.opencensus.io/trace"
"golang.org/x/sync/errgroup"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -15,12 +16,19 @@ import (
// GetStatsSummary returns the stats summary for pods running on ACI
func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summary, err error) {
ctx, span := trace.StartSpan(ctx, "GetSummaryStats")
defer span.End()
addAzureAttributes(span, p)
p.metricsSync.Lock()
defer p.metricsSync.Unlock()
span.Annotate(nil, "acquired metrics mutex")
if time.Now().Sub(p.metricsSyncTime) < time.Minute {
span.AddAttributes(trace.BoolAttribute("preCachedResult", true), trace.StringAttribute("cachedResultSampleTime", p.metricsSyncTime.String()))
return p.lastMetric, nil
}
span.AddAttributes(trace.BoolAttribute("preCachedResult", false), trace.StringAttribute("cachedResultSampleTime", p.metricsSyncTime.String()))
select {
case <-ctx.Done():
@@ -37,6 +45,7 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
}()
pods := p.resourceManager.GetPods()
var errGroup errgroup.Group
chResult := make(chan stats.PodStats, len(pods))
@@ -48,7 +57,16 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
if pod.Status.Phase != v1.PodRunning {
continue
}
pod := pod
errGroup.Go(func() error {
ctx, span := trace.StartSpan(ctx, "getPodMetrics")
defer span.End()
span.AddAttributes(
trace.StringAttribute("UID", string(pod.UID)),
trace.StringAttribute("Name", pod.Name),
trace.StringAttribute("Namespace", pod.Namespace),
)
select {
case <-ctx.Done():
return ctx.Err()
@@ -58,6 +76,8 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
<-sema
}()
span.Annotate(nil, "Acquired semaphore")
cgName := containerGroupName(pod)
// cpu/mem and net stats are split because net stats do not support container level detail
systemStats, err := p.aciClient.GetContainerGroupMetrics(ctx, p.resourceGroup, cgName, aci.MetricsRequest{
@@ -68,8 +88,10 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
Types: []aci.MetricType{aci.MetricTypeCPUUsage, aci.MetricTypeMemoryUsage},
})
if err != nil {
span.SetStatus(trace.Status{Code: trace.StatusCodeUnknown, Message: err.Error()})
return errors.Wrapf(err, "error fetching cpu/mem stats for container group %s", cgName)
}
span.Annotate(nil, "Got system stats")
netStats, err := p.aciClient.GetContainerGroupMetrics(ctx, p.resourceGroup, cgName, aci.MetricsRequest{
Start: start,
@@ -78,8 +100,10 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
Types: []aci.MetricType{aci.MetricTyperNetworkBytesRecievedPerSecond, aci.MetricTyperNetworkBytesTransmittedPerSecond},
})
if err != nil {
span.SetStatus(trace.Status{Code: trace.StatusCodeUnknown, Message: err.Error()})
return errors.Wrapf(err, "error fetching network stats for container group %s", cgName)
}
span.Annotate(nil, "Got network stats")
chResult <- collectMetrics(pod, systemStats, netStats)
return nil
@@ -90,6 +114,7 @@ func (p *ACIProvider) GetStatsSummary(ctx context.Context) (summary *stats.Summa
return nil, errors.Wrap(err, "error in request to fetch container group metrics")
}
close(chResult)
span.Annotate([]trace.Attribute{trace.Int64Attribute("nPods", int64(len(pods)))}, "Collected stats from Azure")
var s stats.Summary
s.Node = stats.NodeStats{