From bab9c59ac810fcbcd737fb0e4b1e251dd43b92eb Mon Sep 17 00:00:00 2001 From: Vipin Duleb Date: Tue, 2 Apr 2019 18:11:35 -0700 Subject: [PATCH] GPU support in ACI provider (#563) * GPU support in ACI provider --- Gopkg.lock | 8 +- Gopkg.toml | 2 +- providers/azure/aci.go | 146 +++++++-- providers/azure/aciMock.go | 18 ++ providers/azure/aci_test.go | 291 +++++++++++++++++- .../azure-aci/client/aci/analytics.go | 2 + .../azure-aci/client/aci/client.go | 4 +- .../azure-aci/client/aci/exec.go | 5 +- .../azure-aci/client/aci/get.go | 16 +- .../azure-aci/client/aci/rp.go | 76 +++++ .../azure-aci/client/aci/types.go | 52 +++- 11 files changed, 557 insertions(+), 63 deletions(-) create mode 100644 vendor/github.com/virtual-kubelet/azure-aci/client/aci/rp.go diff --git a/Gopkg.lock b/Gopkg.lock index fe46af7b9..af2ec2f59 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -24,7 +24,6 @@ packages = [ "services/batch/2017-09-01.6.0/batch", "services/network/mgmt/2018-08-01/network", - "services/preview/servicefabricmesh/mgmt/2018-07-01-preview/servicefabricmesh", "version", ] pruneopts = "NUT" @@ -1076,7 +1075,6 @@ packages = [ "assert", "mock", - "require", ] pruneopts = "NUT" revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71" @@ -1110,7 +1108,7 @@ version = "v0.10.2" [[projects]] - digest = "1:9a98ebb3ee9bb59770d26639de8f2c9a65d695daaa1c1957d34de84a6f40c282" + digest = "1:868e54a0d7eefb6a5fc88b7379c56ea70d50fb2c7905690f83bb9b7474a321e1" name = "github.com/virtual-kubelet/azure-aci" packages = [ "client", @@ -1119,8 +1117,8 @@ "client/network", ] pruneopts = "NUT" - revision = "2e12def8b355625e62ffdb319e7c0b26d1d9c15d" - version = "v0.1.0" + revision = "a846478f02b6b4219bf5bd8ea5d608513ef623f7" + version = "v0.2.0" [[projects]] digest = "1:a707ec1c12a88afc978307bca7f40bde7bd6b6434ceee41f8d7c6985e245dbdb" diff --git a/Gopkg.toml b/Gopkg.toml index d7da59493..9dd52365d 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -140,7 +140,7 @@ [[constraint]] name = "github.com/virtual-kubelet/azure-aci" - version = "v0.1.0" + version = "v0.2.0" [[constraint]] diff --git a/providers/azure/aci.go b/providers/azure/aci.go index a82ed332d..46f1c25f7 100644 --- a/providers/azure/aci.go +++ b/providers/azure/aci.go @@ -53,6 +53,12 @@ const ( maxDNSSearchListChars = 256 ) +const ( + gpuResourceName v1.ResourceName = "nvidia.com/gpu" + gpuTypeAnnotation = "virtual-kubelet.io/gpu-type" +) + + // ACIProvider implements the virtual-kubelet provider interface and communicates with Azure's ACI APIs. type ACIProvider struct { aciClient *aci.Client @@ -64,6 +70,8 @@ type ACIProvider struct { cpu string memory string pods string + gpu string + gpuSKUs []aci.GPUSKU internalIP string daemonEndpointPort int32 diagnostics *aci.ContainerGroupDiagnostics @@ -260,21 +268,8 @@ func NewACIProvider(config string, rm *manager.ResourceManager, nodeName, operat return nil, errors.New(unsupportedRegionMessage) } - // Set sane defaults for Capacity in case config is not supplied - p.cpu = "800" - p.memory = "4Ti" - p.pods = "800" - - if cpuQuota := os.Getenv("ACI_QUOTA_CPU"); cpuQuota != "" { - p.cpu = cpuQuota - } - - if memoryQuota := os.Getenv("ACI_QUOTA_MEMORY"); memoryQuota != "" { - p.memory = memoryQuota - } - - if podsQuota := os.Getenv("ACI_QUOTA_POD"); podsQuota != "" { - p.pods = podsQuota + if err := p.setupCapacity(context.TODO()); err != nil { + return nil, err } p.operatingSystem = operatingSystem @@ -324,6 +319,54 @@ func NewACIProvider(config string, rm *manager.ResourceManager, nodeName, operat return &p, err } +func (p *ACIProvider) setupCapacity(ctx context.Context) error { + ctx, span := trace.StartSpan(ctx, "setupCapacity") + defer span.End() + logger := log.G(ctx).WithField("method", "setupCapacity") + + // Set sane defaults for Capacity in case config is not supplied + p.cpu = "800" + p.memory = "4Ti" + p.pods = "800" + + if cpuQuota := os.Getenv("ACI_QUOTA_CPU"); cpuQuota != "" { + p.cpu = cpuQuota + } + + if memoryQuota := os.Getenv("ACI_QUOTA_MEMORY"); memoryQuota != "" { + p.memory = memoryQuota + } + + if podsQuota := os.Getenv("ACI_QUOTA_POD"); podsQuota != "" { + p.pods = podsQuota + } + + metadata, err := p.aciClient.GetResourceProviderMetadata(ctx) + + if err != nil { + msg := "Unable to fetch the ACI metadata" + logger.WithError(err).Error(msg) + return err + } + + if metadata == nil || metadata.GPURegionalSKUs == nil { + logger.Warn("ACI GPU capacity is not enabled. GPU capacity will be disabled") + return nil + } + + for _, regionalSKU := range metadata.GPURegionalSKUs { + if strings.EqualFold(regionalSKU.Location, p.region) && len(regionalSKU.SKUs) != 0 { + p.gpu = "100" + if gpu := os.Getenv("ACI_QUOTA_GPU"); gpu != "" { + p.gpu = gpu + } + p.gpuSKUs = regionalSKU.SKUs + } + } + + return nil +} + func (p *ACIProvider) setupNetworkProfile(auth *client.Authentication) error { c, err := network.NewClient(auth, p.extraUserAgent) if err != nil { @@ -706,7 +749,7 @@ func (p *ACIProvider) GetPod(ctx context.Context, namespace, name string) (*v1.P defer span.End() ctx = addAzureAttributes(ctx, span, p) - cg, err, status := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, name)) + cg, status, err := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, name)) if err != nil { if status != nil && *status == http.StatusNotFound { return nil, nil @@ -728,7 +771,7 @@ func (p *ACIProvider) GetContainerLogs(ctx context.Context, namespace, podName, ctx = addAzureAttributes(ctx, span, p) logContent := "" - cg, err, _ := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, podName)) + cg, _, err := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, podName)) if err != nil { return logContent, err } @@ -768,7 +811,7 @@ func (p *ACIProvider) ExecInContainer(name string, uid types.UID, container stri defer errstream.Close() } - cg, err, _ := p.aciClient.GetContainerGroup(context.TODO(), p.resourceGroup, name) + cg, _, err := p.aciClient.GetContainerGroup(context.TODO(), p.resourceGroup, name) if err != nil { return err } @@ -789,10 +832,10 @@ func (p *ACIProvider) ExecInContainer(name string, uid types.UID, container stri return err } - wsUri := xcrsp.WebSocketUri + wsURI := xcrsp.WebSocketURI password := xcrsp.Password - c, _, _ := websocket.DefaultDialer.Dial(wsUri, nil) + c, _, _ := websocket.DefaultDialer.Dial(wsURI, nil) c.WriteMessage(websocket.TextMessage, []byte(password)) // Websocket password needs to be sent before WS terminal is active // Cleanup on exit @@ -889,11 +932,17 @@ func (p *ACIProvider) GetPods(ctx context.Context) ([]*v1.Pod, error) { // Capacity returns a resource list containing the capacity limits set for ACI. func (p *ACIProvider) Capacity(ctx context.Context) v1.ResourceList { - return v1.ResourceList{ - "cpu": resource.MustParse(p.cpu), - "memory": resource.MustParse(p.memory), - "pods": resource.MustParse(p.pods), + resourceList := v1.ResourceList{ + v1.ResourceCPU: resource.MustParse(p.cpu), + v1.ResourceMemory: resource.MustParse(p.memory), + v1.ResourcePods: resource.MustParse(p.pods), } + + if p.gpu != "" { + resourceList[gpuResourceName] = resource.MustParse(p.gpu) + } + + return resourceList } // NodeConditions returns a list of conditions (Ready, OutOfDisk, etc), for updates to the node status @@ -1146,7 +1195,7 @@ func (p *ACIProvider) getContainers(pod *v1.Pod) ([]aci.Container, error) { } c.Resources = aci.ResourceRequirements{ - Requests: &aci.ResourceRequests{ + Requests: &aci.ComputeResources{ CPU: cpuRequest, MemoryInGB: memoryRequest, }, @@ -1163,10 +1212,29 @@ func (p *ACIProvider) getContainers(pod *v1.Pod) ([]aci.Container, error) { memoryLimit = float64(container.Resources.Limits.Memory().Value()) / 1000000000.00 } - c.Resources.Limits = &aci.ResourceLimits{ + c.Resources.Limits = &aci.ComputeResources{ CPU: cpuLimit, MemoryInGB: memoryLimit, } + + if gpu, ok := container.Resources.Limits[gpuResourceName]; ok { + sku, err := p.getGPUSKU(pod) + if err != nil { + return nil, err + } + + if gpu.Value() == 0 { + return nil, errors.New("GPU must be a integer number") + } + + gpuResource := &aci.GPUResource{ + Count: int32(gpu.Value()), + SKU: sku, + } + + c.Resources.Requests.GPU = gpuResource + c.Resources.Limits.GPU = gpuResource + } } if container.LivenessProbe != nil { @@ -1190,6 +1258,24 @@ func (p *ACIProvider) getContainers(pod *v1.Pod) ([]aci.Container, error) { return containers, nil } +func (p *ACIProvider) getGPUSKU(pod *v1.Pod) (aci.GPUSKU, error) { + if len(p.gpuSKUs) == 0 { + return "", fmt.Errorf("The pod requires GPU resource, but ACI doesn't provide GPU enabled container group in region %s", p.region) + } + + if desiredSKU, ok := pod.Annotations[gpuTypeAnnotation]; ok { + for _, supportedSKU := range p.gpuSKUs { + if strings.EqualFold(string(desiredSKU), string(supportedSKU)) { + return supportedSKU, nil + } + } + + return "", fmt.Errorf("The pod requires GPU SKU %s, but ACI only supports SKUs %v in region %s", desiredSKU, p.region, p.gpuSKUs) + } + + return p.gpuSKUs[0], nil +} + func getProbe(probe *v1.Probe) (*aci.ContainerProbe, error) { if probe.Handler.Exec != nil && probe.Handler.HTTPGet != nil { @@ -1376,11 +1462,19 @@ func containerGroupToPod(cg *aci.ContainerGroup) (*v1.Pod, error) { }, } + if c.Resources.Requests.GPU != nil { + container.Resources.Requests[gpuResourceName] = resource.MustParse(fmt.Sprintf("%d", c.Resources.Requests.GPU.Count)) + } + if c.Resources.Limits != nil { container.Resources.Limits = v1.ResourceList{ v1.ResourceCPU: resource.MustParse(fmt.Sprintf("%g", c.Resources.Limits.CPU)), v1.ResourceMemory: resource.MustParse(fmt.Sprintf("%gG", c.Resources.Limits.MemoryInGB)), } + + if c.Resources.Limits.GPU != nil { + container.Resources.Limits[gpuResourceName] = resource.MustParse(fmt.Sprintf("%d", c.Resources.Requests.GPU.Count)) + } } containers = append(containers, container) diff --git a/providers/azure/aciMock.go b/providers/azure/aciMock.go index f98906ddc..f79023075 100644 --- a/providers/azure/aciMock.go +++ b/providers/azure/aciMock.go @@ -16,12 +16,14 @@ type ACIMock struct { OnCreate func(string, string, string, *aci.ContainerGroup) (int, interface{}) OnGetContainerGroups func(string, string) (int, interface{}) OnGetContainerGroup func(string, string, string) (int, interface{}) + OnGetRPManifest func() (int, interface{}) } const ( containerGroupsRoute = "/subscriptions/{subscriptionId}/resourceGroups/{resourceGroup}/providers/Microsoft.ContainerInstance/containerGroups" containerGroupRoute = containerGroupsRoute + "/{containerGroup}" containerGroupLogRoute = containerGroupRoute + "/containers/{containerName}/logs" + resourceProviderRoute = "/providers/Microsoft.ContainerInstance" ) // NewACIMock creates a new Azure Container Instance mock server. @@ -103,6 +105,22 @@ func (mock *ACIMock) start() { w.WriteHeader(http.StatusNotImplemented) }).Methods("GET") + router.HandleFunc( + resourceProviderRoute, + func(w http.ResponseWriter, r *http.Request) { + if mock.OnGetRPManifest != nil { + statusCode, response := mock.OnGetRPManifest() + w.WriteHeader(statusCode) + b := new(bytes.Buffer) + json.NewEncoder(b).Encode(response) + w.Write(b.Bytes()) + + return + } + + w.WriteHeader(http.StatusNotImplemented) + }).Methods("GET") + mock.server = httptest.NewServer(router) } diff --git a/providers/azure/aci_test.go b/providers/azure/aci_test.go index 4c8542fbc..f663dc356 100644 --- a/providers/azure/aci_test.go +++ b/providers/azure/aci_test.go @@ -35,6 +35,7 @@ const ( fakeClientSecret = "VGhpcyBpcyBhIHNlY3JldAo=" fakeTenantID = "8cb81aca-83fe-4c6f-b667-4ec09c45a8bf" fakeNodeName = "vk" + fakeRegion = "eastus" ) // Test make registry credential @@ -199,6 +200,166 @@ func TestCreatePodWithResourceRequestOnly(t *testing.T) { } } +// Tests create pod with default GPU SKU. +func TestCreatePodWithGPU(t *testing.T) { + aadServerMocker := NewAADMock() + aciServerMocker := NewACIMock() + + podName := "pod-" + uuid.New().String() + podNamespace := "ns-" + uuid.New().String() + gpuSKU := aci.GPUSKU("sku-" + uuid.New().String()) + + aciServerMocker.OnGetRPManifest = func() (int, interface{}) { + manifest := &aci.ResourceProviderManifest{ + Metadata: &aci.ResourceProviderMetadata{ + GPURegionalSKUs: []*aci.GPURegionalSKU{ + &aci.GPURegionalSKU{ + Location: fakeRegion, + SKUs: []aci.GPUSKU{gpuSKU, aci.K80, aci.P100}, + }, + }, + }, + } + + return http.StatusOK, manifest + } + + provider, err := createTestProvider(aadServerMocker, aciServerMocker) + if err != nil { + t.Fatalf("failed to create the test provider. %s", err.Error()) + return + } + + aciServerMocker.OnCreate = func(subscription, resourceGroup, containerGroup string, cg *aci.ContainerGroup) (int, interface{}) { + assert.Check(t, is.Equal(fakeSubscription, subscription), "Subscription doesn't match") + assert.Check(t, is.Equal(fakeResourceGroup, resourceGroup), "Resource group doesn't match") + assert.Check(t, is.Equal(podNamespace+"-"+podName, containerGroup), "Container group name is not expected") + assert.Check(t, cg.ContainerGroupProperties.Containers != nil, "Containers should not be nil") + assert.Check(t, is.Equal(1, len(cg.ContainerGroupProperties.Containers)), "1 Container is expected") + assert.Check(t, is.Equal("nginx", cg.ContainerGroupProperties.Containers[0].Name), "Container nginx is expected") + assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests != nil, "Container resource requests should not be nil") + assert.Check(t, is.Equal(1.98, cg.ContainerGroupProperties.Containers[0].Resources.Requests.CPU), "Request CPU is not expected") + assert.Check(t, is.Equal(3.4, cg.ContainerGroupProperties.Containers[0].Resources.Requests.MemoryInGB), "Request Memory is not expected") + assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU != nil, "Requests GPU is not expected") + assert.Check(t, is.Equal(int32(10), cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.Count), "Requests GPU Count is not expected") + assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.SKU), "Requests GPU SKU is not expected") + assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU != nil, "Limits GPU is not expected") + assert.Check(t, is.Equal(int32(10), cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.Count), "Requests GPU Count is not expected") + assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.SKU), "Requests GPU SKU is not expected") + + return http.StatusOK, cg + } + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: podNamespace, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "nginx", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("1.981"), + "memory": resource.MustParse("3.49G"), + }, + Limits: v1.ResourceList{ + gpuResourceName: resource.MustParse("10"), + }, + }, + }, + }, + }, + } + + if err := provider.CreatePod(context.Background(), pod); err != nil { + t.Fatal("Failed to create pod", err) + } +} + + // Tests create pod with GPU SKU in annotation. +func TestCreatePodWithGPUSKU(t *testing.T) { + aadServerMocker := NewAADMock() + aciServerMocker := NewACIMock() + + podName := "pod-" + uuid.New().String() + podNamespace := "ns-" + uuid.New().String() + gpuSKU := aci.GPUSKU("sku-" + uuid.New().String()) + + aciServerMocker.OnGetRPManifest = func() (int, interface{}) { + manifest := &aci.ResourceProviderManifest{ + Metadata: &aci.ResourceProviderMetadata{ + GPURegionalSKUs: []*aci.GPURegionalSKU{ + &aci.GPURegionalSKU{ + Location: fakeRegion, + SKUs: []aci.GPUSKU{aci.K80, aci.P100, gpuSKU}, + }, + }, + }, + } + + return http.StatusOK, manifest + } + + provider, err := createTestProvider(aadServerMocker, aciServerMocker) + if err != nil { + t.Fatalf("failed to create the test provider. %s", err.Error()) + return + } + + aciServerMocker.OnCreate = func(subscription, resourceGroup, containerGroup string, cg *aci.ContainerGroup) (int, interface{}) { + assert.Check(t, is.Equal(fakeSubscription, subscription), "Subscription doesn't match") + assert.Check(t, is.Equal(fakeResourceGroup, resourceGroup), "Resource group doesn't match") + assert.Check(t, cg != nil, "Container group is nil") + assert.Check(t, is.Equal(podNamespace+"-"+podName, containerGroup), "Container group name is not expected") + assert.Check(t, cg.ContainerGroupProperties.Containers != nil, "Containers should not be nil") + assert.Check(t, is.Equal(1, len(cg.ContainerGroupProperties.Containers)), "1 Container is expected") + assert.Check(t, is.Equal("nginx", cg.ContainerGroupProperties.Containers[0].Name), "Container nginx is expected") + assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests != nil, "Container resource requests should not be nil") + assert.Check(t, is.Equal(1.98, cg.ContainerGroupProperties.Containers[0].Resources.Requests.CPU), "Request CPU is not expected") + assert.Check(t, is.Equal(3.4, cg.ContainerGroupProperties.Containers[0].Resources.Requests.MemoryInGB), "Request Memory is not expected") + assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU != nil, "Requests GPU is not expected") + assert.Check(t, is.Equal(int32(1), cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.Count), "Requests GPU Count is not expected") + assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.SKU), "Requests GPU SKU is not expected") + assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU != nil, "Limits GPU is not expected") + assert.Check(t, is.Equal(int32(1), cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.Count), "Requests GPU Count is not expected") + assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.SKU), "Requests GPU SKU is not expected") + + return http.StatusOK, cg + } + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: podNamespace, + Annotations: map[string]string{ + gpuTypeAnnotation: string(gpuSKU), + }, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "nginx", + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("1.981"), + "memory": resource.MustParse("3.49G"), + }, + Limits: v1.ResourceList{ + gpuResourceName: resource.MustParse("1"), + }, + }, + }, + }, + }, + } + + if err := provider.CreatePod(context.Background(), pod); err != nil { + t.Fatal("Failed to create pod", err) + } +} + // Tests create pod with both resource request and limit. func TestCreatePodWithResourceRequestAndLimit(t *testing.T) { _, aciServerMocker, provider, err := prepareMocks() @@ -314,7 +475,7 @@ func TestGetPodsWithoutResourceRequestsLimits(t *testing.T) { }, }, Resources: aci.ResourceRequirements{ - Requests: &aci.ResourceRequests{ + Requests: &aci.ComputeResources{ CPU: 0.99, MemoryInGB: 1.5, }, @@ -385,7 +546,7 @@ func TestGetPodWithoutResourceRequestsLimits(t *testing.T) { }, }, Resources: aci.ResourceRequirements{ - Requests: &aci.ResourceRequests{ + Requests: &aci.ComputeResources{ CPU: 0.99, MemoryInGB: 1.5, }, @@ -412,6 +573,94 @@ func TestGetPodWithoutResourceRequestsLimits(t *testing.T) { pod.Spec.Containers[0].Resources.Requests.Memory().Value()), "Containers[0].Resources.Requests.Memory doesn't match") } +// Tests get pod with GPU. +func TestGetPodWithGPU(t *testing.T) { + _, aciServerMocker, provider, err := prepareMocks() + + if err != nil { + t.Fatal("Unable to prepare the mocks", err) + } + + podName := "pod-" + uuid.New().String() + podNamespace := "ns-" + uuid.New().String() + + aciServerMocker.OnGetContainerGroup = func(subscription, resourceGroup, containerGroup string) (int, interface{}) { + assert.Equal(t, fakeSubscription, subscription, "Subscription doesn't match") + assert.Equal(t, fakeResourceGroup, resourceGroup, "Resource group doesn't match") + assert.Equal(t, podNamespace+"-"+podName, containerGroup, "Container group name is not expected") + + return http.StatusOK, aci.ContainerGroup{ + Tags: map[string]string{ + "NodeName": fakeNodeName, + }, + ContainerGroupProperties: aci.ContainerGroupProperties{ + ProvisioningState: "Creating", + Containers: []aci.Container{ + aci.Container{ + Name: "nginx", + ContainerProperties: aci.ContainerProperties{ + Image: "nginx", + Command: []string{"nginx", "-g", "daemon off;"}, + Ports: []aci.ContainerPort{ + { + Protocol: aci.ContainerNetworkProtocolTCP, + Port: 80, + }, + }, + Resources: aci.ResourceRequirements{ + Requests: &aci.ComputeResources{ + CPU: 0.99, + MemoryInGB: 1.5, + GPU: &aci.GPUResource{ + Count: 5, + SKU: aci.P100, + }, + }, + Limits: &aci.ComputeResources{ + GPU: &aci.GPUResource{ + Count: 5, + SKU: aci.P100, + }, + }, + }, + }, + }, + }, + }, + } + } + + pod, err := provider.GetPod(context.Background(), podNamespace, podName) + if err != nil { + t.Fatal("Failed to get pod", err) + } + + assert.Check(t, pod != nil, "Response pod should not be nil") + assert.Check(t, pod.Spec.Containers != nil, "Containers should not be nil") + assert.Check(t, pod.Spec.Containers[0].Resources.Requests != nil, "Containers[0].Resources.Requests should not be nil") + assert.Check( + t, + is.Equal(ptrQuantity(resource.MustParse("0.99")).Value(), pod.Spec.Containers[0].Resources.Requests.Cpu().Value()), + "Containers[0].Resources.Requests.CPU doesn't match") + assert.Check( + t, + is.Equal(ptrQuantity(resource.MustParse("1.5G")).Value(), pod.Spec.Containers[0].Resources.Requests.Memory().Value()), + "Containers[0].Resources.Requests.Memory doesn't match") + gpuQuantity, ok := pod.Spec.Containers[0].Resources.Requests[gpuResourceName] + assert.Check(t, is.Equal(ok, true), "Containers[0].Resources.Requests.GPU should not be nil") + assert.Check( + t, + is.Equal(ptrQuantity(resource.MustParse("5")).Value(), ptrQuantity(gpuQuantity).Value()), + "Containers[0].Resources.Requests.GPU.Count doesn't match") + assert.Check(t, pod.Spec.Containers[0].Resources.Limits != nil, "Containers[0].Resources.Limits should not be nil") + gpuQuantity, ok = pod.Spec.Containers[0].Resources.Limits[gpuResourceName] + assert.Check(t, is.Equal(ok, true), "Containers[0].Resources.Requests.GPU should not be nil") + assert.Check( + t, + is.Equal(ptrQuantity(resource.MustParse("5")).Value(), ptrQuantity(gpuQuantity).Value()), + "Containers[0].Resources.Limits.GPU.Count doesn't match") +} + func TestGetPodWithContainerID(t *testing.T) { _, aciServerMocker, provider, err := prepareMocks() @@ -451,7 +700,7 @@ func TestGetPodWithContainerID(t *testing.T) { }, }, Resources: aci.ResourceRequirements{ - Requests: &aci.ResourceRequests{ + Requests: &aci.ComputeResources{ CPU: 0.99, MemoryInGB: 1.5, }, @@ -531,6 +780,30 @@ func prepareMocks() (*AADMock, *ACIMock, *ACIProvider, error) { aadServerMocker := NewAADMock() aciServerMocker := NewACIMock() + aciServerMocker.OnGetRPManifest = func() (int, interface{}) { + manifest := &aci.ResourceProviderManifest{ + Metadata: &aci.ResourceProviderMetadata{ + GPURegionalSKUs: []*aci.GPURegionalSKU{ + &aci.GPURegionalSKU{ + Location: fakeRegion, + SKUs: []aci.GPUSKU{aci.K80, aci.P100, aci.V100}, + }, + }, + }, + } + + return http.StatusOK, manifest + } + + provider, err := createTestProvider(aadServerMocker, aciServerMocker) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to create the test provider %s", err.Error()) + } + + return aadServerMocker, aciServerMocker, provider, nil +} + +func createTestProvider(aadServerMocker *AADMock, aciServerMocker*ACIMock) (*ACIProvider, error) { auth := azure.NewAuthentication( azure.PublicCloud.Name, fakeClientID, @@ -543,7 +816,7 @@ func prepareMocks() (*AADMock, *ACIMock, *ACIProvider, error) { file, err := ioutil.TempFile("", "auth.json") if err != nil { - return nil, nil, nil, err + return nil, err } defer os.Remove(file.Name()) @@ -552,23 +825,25 @@ func prepareMocks() (*AADMock, *ACIMock, *ACIProvider, error) { json.NewEncoder(b).Encode(auth) if _, err := file.Write(b.Bytes()); err != nil { - return nil, nil, nil, err + return nil, err } os.Setenv("AZURE_AUTH_LOCATION", file.Name()) os.Setenv("ACI_RESOURCE_GROUP", fakeResourceGroup) + os.Setenv("ACI_REGION", fakeRegion) rm, err := manager.NewResourceManager(nil, nil, nil) + if err != nil { - return nil, nil, nil, err + return nil, err } provider, err := NewACIProvider("example.toml", rm, fakeNodeName, "Linux", "0.0.0.0", 10250) if err != nil { - return nil, nil, nil, err + return nil, err } - return aadServerMocker, aciServerMocker, provider, nil + return provider, nil } func ptrQuantity(q resource.Quantity) *resource.Quantity { diff --git a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/analytics.go b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/analytics.go index 0b1f00f3f..4156b5712 100644 --- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/analytics.go +++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/analytics.go @@ -7,6 +7,7 @@ import ( "io/ioutil" ) +// NewContainerGroupDiagnostics creates a container group diagnostics object func NewContainerGroupDiagnostics(logAnalyticsID, logAnalyticsKey string) (*ContainerGroupDiagnostics, error) { if logAnalyticsID == "" || logAnalyticsKey == "" { @@ -21,6 +22,7 @@ func NewContainerGroupDiagnostics(logAnalyticsID, logAnalyticsKey string) (*Cont }, nil } +// NewContainerGroupDiagnosticsFromFile creates a container group diagnostics object from the specified file func NewContainerGroupDiagnosticsFromFile(filepath string) (*ContainerGroupDiagnostics, error) { analyticsdata, err := ioutil.ReadFile(filepath) diff --git a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/client.go b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/client.go index 73f5a49e6..e6ac82563 100644 --- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/client.go +++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/client.go @@ -14,8 +14,8 @@ import ( const ( // BaseURI is the default URI used for compute services. baseURI = "https://management.azure.com" - defaultUserAgent = "virtual-kubelet/azure-arm-aci/2018-09-01" - apiVersion = "2018-09-01" + defaultUserAgent = "virtual-kubelet/azure-arm-aci/2018-10-01" + apiVersion = "2018-10-01" containerGroupURLPath = "subscriptions/{{.subscriptionId}}/resourceGroups/{{.resourceGroup}}/providers/Microsoft.ContainerInstance/containerGroups/{{.containerGroupName}}" containerGroupListURLPath = "subscriptions/{{.subscriptionId}}/providers/Microsoft.ContainerInstance/containerGroups" diff --git a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/exec.go b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/exec.go index 321c31d9e..096548b81 100644 --- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/exec.go +++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/exec.go @@ -11,12 +11,13 @@ import ( "github.com/virtual-kubelet/azure-aci/client/api" ) +// TerminalSizeRequest is the terminal size request type TerminalSizeRequest struct { Width int Height int } -// Starts the exec command for a specified container instance in a specified resource group and container group. +// LaunchExec starts the exec command for a specified container instance in a specified resource group and container group. // From: https://docs.microsoft.com/en-us/rest/api/container-instances/startcontainer/launchexec func (c *Client) LaunchExec(resourceGroup, containerGroupName, containerName, command string, terminalSize TerminalSizeRequest) (ExecResponse, error) { urlParams := url.Values{ @@ -35,7 +36,7 @@ func (c *Client) LaunchExec(resourceGroup, containerGroupName, containerName, co var xcrsp ExecResponse xcrsp.Password = "" - xcrsp.WebSocketUri = "" + xcrsp.WebSocketURI = "" b := new(bytes.Buffer) diff --git a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/get.go b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/get.go index 2d4c2a532..43ebf6c15 100644 --- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/get.go +++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/get.go @@ -14,7 +14,7 @@ import ( // GetContainerGroup gets an Azure Container Instance in the provided // resource group with the given container group name. // From: https://docs.microsoft.com/en-us/rest/api/container-instances/containergroups/get -func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, containerGroupName string) (*ContainerGroup, error, *int) { +func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, containerGroupName string) (*ContainerGroup, *int, error) { urlParams := url.Values{ "api-version": []string{apiVersion}, } @@ -26,7 +26,7 @@ func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, container // Create the request. req, err := http.NewRequest("GET", uri, nil) if err != nil { - return nil, fmt.Errorf("Creating get container group uri request failed: %v", err), nil + return nil, nil, fmt.Errorf("Creating get container group uri request failed: %v", err) } req = req.WithContext(ctx) @@ -36,29 +36,29 @@ func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, container "resourceGroup": resourceGroup, "containerGroupName": containerGroupName, }); err != nil { - return nil, fmt.Errorf("Expanding URL with parameters failed: %v", err), nil + return nil, nil, fmt.Errorf("Expanding URL with parameters failed: %v", err) } // Send the request. resp, err := c.hc.Do(req) if err != nil { - return nil, fmt.Errorf("Sending get container group request failed: %v", err), nil + return nil, nil, fmt.Errorf("Sending get container group request failed: %v", err) } defer resp.Body.Close() // 200 (OK) is a success response. if err := api.CheckResponse(resp); err != nil { - return nil, err, &resp.StatusCode + return nil, &resp.StatusCode, err } // Decode the body from the response. if resp.Body == nil { - return nil, errors.New("Get container group returned an empty body in the response"), &resp.StatusCode + return nil, &resp.StatusCode, errors.New("Get container group returned an empty body in the response") } var cg ContainerGroup if err := json.NewDecoder(resp.Body).Decode(&cg); err != nil { - return nil, fmt.Errorf("Decoding get container group response body failed: %v", err), &resp.StatusCode + return nil, &resp.StatusCode, fmt.Errorf("Decoding get container group response body failed: %v", err) } - return &cg, nil, &resp.StatusCode + return &cg, &resp.StatusCode, nil } diff --git a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/rp.go b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/rp.go new file mode 100644 index 000000000..83af8dee4 --- /dev/null +++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/rp.go @@ -0,0 +1,76 @@ +package aci + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/url" + + "github.com/virtual-kubelet/azure-aci/client/api" +) + +const ( + resourceProviderURLPath = "providers/Microsoft.ContainerInstance" + resourceProviderAPIVersion = "2018-02-01" +) + +// GetResourceProviderMetadata gets the ACI resource provider metadata +func (c *Client) GetResourceProviderMetadata(ctx context.Context) (*ResourceProviderMetadata, error) { + manifest, err := c.getResourceProviderManifest(ctx) + if err != nil { + return nil, err + } + + if manifest == nil { + return nil, fmt.Errorf("The resource provider manifest is empty") + } + + if manifest.Metadata == nil { + return nil, fmt.Errorf("The resource provider metadata is empty") + } + + return manifest.Metadata, nil +} + +func (c *Client) getResourceProviderManifest(ctx context.Context) (*ResourceProviderManifest, error) { + urlParams := url.Values{ + "api-version": []string{resourceProviderAPIVersion}, + "$expand": []string{"metadata"}, + } + + // Create the url. + uri := api.ResolveRelative(c.auth.ResourceManagerEndpoint, resourceProviderURLPath) + uri += "?" + url.Values(urlParams).Encode() + + // Create the request. + req, err := http.NewRequest("GET", uri, nil) + if err != nil { + return nil, fmt.Errorf("Creating get resource provider manifest request failed: %v", err) + } + req = req.WithContext(ctx) + + // Send the request. + resp, err := c.hc.Do(req) + if err != nil { + return nil, fmt.Errorf("Sending get resource provider manifest request failed: %v", err) + } + defer resp.Body.Close() + + // 200 (OK) is a success response. + if err := api.CheckResponse(resp); err != nil { + return nil, err + } + + // Decode the body from the response. + if resp.Body == nil { + return nil, errors.New("Get resource provider manifest returned an empty body in the response") + } + var manifest ResourceProviderManifest + if err := json.NewDecoder(resp.Body).Decode(&manifest); err != nil { + return nil, fmt.Errorf("Decoding get resource provider manifest response body failed: %v", err) + } + + return &manifest, nil +} diff --git a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/types.go b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/types.go index 8c6e2f56f..2b07ed593 100644 --- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/types.go +++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/types.go @@ -237,22 +237,35 @@ type Resource struct { Tags map[string]string `json:"tags,omitempty"` } -// ResourceLimits is the resource limits. -type ResourceLimits struct { - MemoryInGB float64 `json:"memoryInGB,omitempty"` - CPU float64 `json:"cpu,omitempty"` +// GPUSKU enumerates the values for GPU SKUs +type GPUSKU string + +const ( + // K80 specifies the K80 GPU SKU + K80 GPUSKU = "K80" + // P100 specifies the P100 GPU SKU + P100 GPUSKU = "P100" + // V100 specifies the V100 GPU SKU + V100 GPUSKU = "V100" +) + +// GPUResource is the GPU resource for the container group. +type GPUResource struct { + Count int32 `json:"count"` + SKU GPUSKU `json:"sku"` } -// ResourceRequests is the resource requests. -type ResourceRequests struct { - MemoryInGB float64 `json:"memoryInGB,omitempty"` - CPU float64 `json:"cpu,omitempty"` +// ComputeResources is the compute resource. +type ComputeResources struct { + MemoryInGB float64 `json:"memoryInGB,omitempty"` + CPU float64 `json:"cpu,omitempty"` + GPU *GPUResource `json:"gpu,omitempty"` } // ResourceRequirements is the resource requirements. type ResourceRequirements struct { - Requests *ResourceRequests `json:"requests,omitempty"` - Limits *ResourceLimits `json:"limits,omitempty"` + Requests *ComputeResources `json:"requests,omitempty"` + Limits *ComputeResources `json:"limits,omitempty"` } // Usage is a single usage result @@ -305,7 +318,7 @@ type ExecRequest struct { // ExecResponse is a request for Launch Exec API response for ACI. type ExecResponse struct { - WebSocketUri string `json:"webSocketUri,omitempty"` + WebSocketURI string `json:"webSocketUri,omitempty"` Password string `json:"password,omitempty"` } @@ -488,3 +501,20 @@ const ( LogAnalyticsMetadataKeyNodeName string = "node-name" LogAnalyticsMetadataKeyClusterResourceID string = "cluster-resource-id" ) + +// GPURegionalSKU is the ACI GPU regional SKU +type GPURegionalSKU struct { + Location string `json:"location"` + SKUs []GPUSKU `json:"skus"` +} + +// ResourceProviderMetadata is the ACI resource provider metadata +type ResourceProviderMetadata struct { + VNetSupportRegions []string `json:"vnetSupportRegions,omitempty"` + GPURegionalSKUs []*GPURegionalSKU `json:"gpuRegionalSkus,omitempty"` +} + +// ResourceProviderManifest is the ACI resource provider manifest +type ResourceProviderManifest struct { + Metadata *ResourceProviderMetadata `json:"metadata"` +}