GPU support in ACI provider (#563)

* GPU support in ACI provider
2019-04-02 18:11:35 -07:00
parent 1dadd46e20
commit bab9c59ac8
11 changed files with 557 additions and 63 deletions
--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -24,7 +24,6 @@
  packages = [
    "services/batch/2017-09-01.6.0/batch",
    "services/network/mgmt/2018-08-01/network",
-    "services/preview/servicefabricmesh/mgmt/2018-07-01-preview/servicefabricmesh",
    "version",
  ]
  pruneopts = "NUT"
@@ -1076,7 +1075,6 @@
  packages = [
    "assert",
    "mock",
-    "require",
  ]
  pruneopts = "NUT"
  revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71"
@@ -1110,7 +1108,7 @@
  version = "v0.10.2"

 [[projects]]
-  digest = "1:9a98ebb3ee9bb59770d26639de8f2c9a65d695daaa1c1957d34de84a6f40c282"
+  digest = "1:868e54a0d7eefb6a5fc88b7379c56ea70d50fb2c7905690f83bb9b7474a321e1"
  name = "github.com/virtual-kubelet/azure-aci"
  packages = [
    "client",
@@ -1119,8 +1117,8 @@
    "client/network",
  ]
  pruneopts = "NUT"
-  revision = "2e12def8b355625e62ffdb319e7c0b26d1d9c15d"
-  version = "v0.1.0"
+  revision = "a846478f02b6b4219bf5bd8ea5d608513ef623f7"
+  version = "v0.2.0"

 [[projects]]
  digest = "1:a707ec1c12a88afc978307bca7f40bde7bd6b6434ceee41f8d7c6985e245dbdb"
--- a/Gopkg.toml
+++ b/Gopkg.toml
@@ -140,7 +140,7 @@

 [[constraint]]
  name = "github.com/virtual-kubelet/azure-aci"
-  version = "v0.1.0"
+  version = "v0.2.0"


 [[constraint]]
--- a/providers/azure/aci.go
+++ b/providers/azure/aci.go
@@ -53,6 +53,12 @@ const (
 	maxDNSSearchListChars = 256
 )

+const (
+	gpuResourceName v1.ResourceName = "nvidia.com/gpu"
+	gpuTypeAnnotation               = "virtual-kubelet.io/gpu-type"
+)
+
+
 // ACIProvider implements the virtual-kubelet provider interface and communicates with Azure's ACI APIs.
 type ACIProvider struct {
 	aciClient          *aci.Client
@@ -64,6 +70,8 @@ type ACIProvider struct {
 	cpu                string
 	memory             string
 	pods               string
+	gpu                string
+	gpuSKUs            []aci.GPUSKU
 	internalIP         string
 	daemonEndpointPort int32
 	diagnostics        *aci.ContainerGroupDiagnostics
@@ -260,21 +268,8 @@ func NewACIProvider(config string, rm *manager.ResourceManager, nodeName, operat
 		return nil, errors.New(unsupportedRegionMessage)
 	}

-	// Set sane defaults for Capacity in case config is not supplied
-	p.cpu = "800"
-	p.memory = "4Ti"
-	p.pods = "800"
-
-	if cpuQuota := os.Getenv("ACI_QUOTA_CPU"); cpuQuota != "" {
-		p.cpu = cpuQuota
-	}
-
-	if memoryQuota := os.Getenv("ACI_QUOTA_MEMORY"); memoryQuota != "" {
-		p.memory = memoryQuota
-	}
-
-	if podsQuota := os.Getenv("ACI_QUOTA_POD"); podsQuota != "" {
-		p.pods = podsQuota
+	if err := p.setupCapacity(context.TODO()); err != nil {
+		return nil, err
 	}

 	p.operatingSystem = operatingSystem
@@ -324,6 +319,54 @@ func NewACIProvider(config string, rm *manager.ResourceManager, nodeName, operat
 	return &p, err
 }

+func (p *ACIProvider) setupCapacity(ctx context.Context) error {
+	ctx, span := trace.StartSpan(ctx, "setupCapacity")
+	defer span.End()
+	logger := log.G(ctx).WithField("method", "setupCapacity")
+
+ 	// Set sane defaults for Capacity in case config is not supplied
+	p.cpu = "800"
+	p.memory = "4Ti"
+	p.pods = "800"
+
+ 	if cpuQuota := os.Getenv("ACI_QUOTA_CPU"); cpuQuota != "" {
+		p.cpu = cpuQuota
+	}
+
+ 	if memoryQuota := os.Getenv("ACI_QUOTA_MEMORY"); memoryQuota != "" {
+		p.memory = memoryQuota
+	}
+
+ 	if podsQuota := os.Getenv("ACI_QUOTA_POD"); podsQuota != "" {
+		p.pods = podsQuota
+	}
+
+ 	metadata, err := p.aciClient.GetResourceProviderMetadata(ctx)
+
+ 	if err != nil {
+		msg := "Unable to fetch the ACI metadata"
+		logger.WithError(err).Error(msg)
+		return err
+	}
+
+ 	if metadata == nil || metadata.GPURegionalSKUs == nil {
+		logger.Warn("ACI GPU capacity is not enabled. GPU capacity will be disabled")
+		return nil
+	}
+
+ 	for _, regionalSKU := range metadata.GPURegionalSKUs {
+		if strings.EqualFold(regionalSKU.Location, p.region) && len(regionalSKU.SKUs) != 0 {
+			p.gpu = "100"
+			if gpu := os.Getenv("ACI_QUOTA_GPU"); gpu != "" {
+				p.gpu = gpu
+			}
+			p.gpuSKUs = regionalSKU.SKUs
+		}
+	}
+
+ 	return nil
+}
+
 func (p *ACIProvider) setupNetworkProfile(auth *client.Authentication) error {
 	c, err := network.NewClient(auth, p.extraUserAgent)
 	if err != nil {
@@ -706,7 +749,7 @@ func (p *ACIProvider) GetPod(ctx context.Context, namespace, name string) (*v1.P
 	defer span.End()
 	ctx = addAzureAttributes(ctx, span, p)

-	cg, err, status := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, name))
+	cg, status, err := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, name))
 	if err != nil {
 		if status != nil && *status == http.StatusNotFound {
 			return nil, nil
@@ -728,7 +771,7 @@ func (p *ACIProvider) GetContainerLogs(ctx context.Context, namespace, podName,
 	ctx = addAzureAttributes(ctx, span, p)

 	logContent := ""
-	cg, err, _ := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, podName))
+	cg, _, err := p.aciClient.GetContainerGroup(ctx, p.resourceGroup, fmt.Sprintf("%s-%s", namespace, podName))
 	if err != nil {
 		return logContent, err
 	}
@@ -768,7 +811,7 @@ func (p *ACIProvider) ExecInContainer(name string, uid types.UID, container stri
 		defer errstream.Close()
 	}

-	cg, err, _ := p.aciClient.GetContainerGroup(context.TODO(), p.resourceGroup, name)
+	cg, _, err := p.aciClient.GetContainerGroup(context.TODO(), p.resourceGroup, name)
 	if err != nil {
 		return err
 	}
@@ -789,10 +832,10 @@ func (p *ACIProvider) ExecInContainer(name string, uid types.UID, container stri
 		return err
 	}

-	wsUri := xcrsp.WebSocketUri
+	wsURI  := xcrsp.WebSocketURI
 	password := xcrsp.Password

-	c, _, _ := websocket.DefaultDialer.Dial(wsUri, nil)
+	c, _, _ := websocket.DefaultDialer.Dial(wsURI, nil)
 	c.WriteMessage(websocket.TextMessage, []byte(password)) // Websocket password needs to be sent before WS terminal is active

 	// Cleanup on exit
@@ -889,11 +932,17 @@ func (p *ACIProvider) GetPods(ctx context.Context) ([]*v1.Pod, error) {

 // Capacity returns a resource list containing the capacity limits set for ACI.
 func (p *ACIProvider) Capacity(ctx context.Context) v1.ResourceList {
-	return v1.ResourceList{
-		"cpu":    resource.MustParse(p.cpu),
-		"memory": resource.MustParse(p.memory),
-		"pods":   resource.MustParse(p.pods),
+	resourceList := v1.ResourceList{
+		v1.ResourceCPU:    resource.MustParse(p.cpu),
+		v1.ResourceMemory: resource.MustParse(p.memory),
+		v1.ResourcePods:   resource.MustParse(p.pods),
 	}
+
+	if p.gpu != "" {
+		resourceList[gpuResourceName] = resource.MustParse(p.gpu)
+	}
+
+	return resourceList
 }

 // NodeConditions returns a list of conditions (Ready, OutOfDisk, etc), for updates to the node status
@@ -1146,7 +1195,7 @@ func (p *ACIProvider) getContainers(pod *v1.Pod) ([]aci.Container, error) {
 		}

 		c.Resources = aci.ResourceRequirements{
-			Requests: &aci.ResourceRequests{
+			Requests: &aci.ComputeResources{
 				CPU:        cpuRequest,
 				MemoryInGB: memoryRequest,
 			},
@@ -1163,10 +1212,29 @@ func (p *ACIProvider) getContainers(pod *v1.Pod) ([]aci.Container, error) {
 				memoryLimit = float64(container.Resources.Limits.Memory().Value()) / 1000000000.00
 			}

-			c.Resources.Limits = &aci.ResourceLimits{
+			c.Resources.Limits = &aci.ComputeResources{
 				CPU:        cpuLimit,
 				MemoryInGB: memoryLimit,
 			}
+
+			if gpu, ok := container.Resources.Limits[gpuResourceName]; ok {
+				sku, err := p.getGPUSKU(pod)
+				if err != nil {
+					return nil, err
+				}
+
+				if gpu.Value() == 0 {
+					return nil, errors.New("GPU must be a integer number")
+				}
+
+				gpuResource := &aci.GPUResource{
+					Count: int32(gpu.Value()),
+					SKU:   sku,
+				}
+
+				c.Resources.Requests.GPU = gpuResource
+				c.Resources.Limits.GPU = gpuResource
+			}
 		}

 		if container.LivenessProbe != nil {
@@ -1190,6 +1258,24 @@ func (p *ACIProvider) getContainers(pod *v1.Pod) ([]aci.Container, error) {
 	return containers, nil
 }

+func (p *ACIProvider) getGPUSKU(pod *v1.Pod) (aci.GPUSKU, error) {
+	if len(p.gpuSKUs) == 0 {
+		return "", fmt.Errorf("The pod requires GPU resource, but ACI doesn't provide GPU enabled container group in region %s", p.region)
+	}
+
+ 	if desiredSKU, ok := pod.Annotations[gpuTypeAnnotation]; ok {
+		for _, supportedSKU := range p.gpuSKUs {
+			if strings.EqualFold(string(desiredSKU), string(supportedSKU)) {
+				return supportedSKU, nil
+			}
+		}
+
+ 		return "", fmt.Errorf("The pod requires GPU SKU %s, but ACI only supports SKUs %v in region %s", desiredSKU, p.region, p.gpuSKUs)
+	}
+
+ 	return p.gpuSKUs[0], nil
+}
+
 func getProbe(probe *v1.Probe) (*aci.ContainerProbe, error) {

 	if probe.Handler.Exec != nil && probe.Handler.HTTPGet != nil {
@@ -1376,11 +1462,19 @@ func containerGroupToPod(cg *aci.ContainerGroup) (*v1.Pod, error) {
 			},
 		}

+		if c.Resources.Requests.GPU != nil {
+			container.Resources.Requests[gpuResourceName] = resource.MustParse(fmt.Sprintf("%d", c.Resources.Requests.GPU.Count))
+		}
+
 		if c.Resources.Limits != nil {
 			container.Resources.Limits = v1.ResourceList{
 				v1.ResourceCPU:    resource.MustParse(fmt.Sprintf("%g", c.Resources.Limits.CPU)),
 				v1.ResourceMemory: resource.MustParse(fmt.Sprintf("%gG", c.Resources.Limits.MemoryInGB)),
 			}
+
+			if c.Resources.Limits.GPU != nil {
+				container.Resources.Limits[gpuResourceName] = resource.MustParse(fmt.Sprintf("%d", c.Resources.Requests.GPU.Count))
+			}
 		}

 		containers = append(containers, container)
--- a/providers/azure/aciMock.go
+++ b/providers/azure/aciMock.go
@@ -16,12 +16,14 @@ type ACIMock struct {
 	OnCreate             func(string, string, string, *aci.ContainerGroup) (int, interface{})
 	OnGetContainerGroups func(string, string) (int, interface{})
 	OnGetContainerGroup  func(string, string, string) (int, interface{})
+	OnGetRPManifest      func() (int, interface{})
 }

 const (
 	containerGroupsRoute   = "/subscriptions/{subscriptionId}/resourceGroups/{resourceGroup}/providers/Microsoft.ContainerInstance/containerGroups"
 	containerGroupRoute    = containerGroupsRoute + "/{containerGroup}"
 	containerGroupLogRoute = containerGroupRoute + "/containers/{containerName}/logs"
+	resourceProviderRoute  = "/providers/Microsoft.ContainerInstance"
 )

 // NewACIMock creates a new Azure Container Instance mock server.
@@ -103,6 +105,22 @@ func (mock *ACIMock) start() {
 			w.WriteHeader(http.StatusNotImplemented)
 		}).Methods("GET")

+	router.HandleFunc(
+		resourceProviderRoute,
+		func(w http.ResponseWriter, r *http.Request) {
+			if mock.OnGetRPManifest != nil {
+				statusCode, response := mock.OnGetRPManifest()
+				w.WriteHeader(statusCode)
+				b := new(bytes.Buffer)
+				json.NewEncoder(b).Encode(response)
+				w.Write(b.Bytes())
+
+				return
+			}
+
+			w.WriteHeader(http.StatusNotImplemented)
+		}).Methods("GET")
+
 	mock.server = httptest.NewServer(router)
 }

--- a/providers/azure/aci_test.go
+++ b/providers/azure/aci_test.go
@@ -35,6 +35,7 @@ const (
 	fakeClientSecret  = "VGhpcyBpcyBhIHNlY3JldAo="
 	fakeTenantID      = "8cb81aca-83fe-4c6f-b667-4ec09c45a8bf"
 	fakeNodeName      = "vk"
+	fakeRegion        = "eastus"
 )

 // Test make registry credential
@@ -199,6 +200,166 @@ func TestCreatePodWithResourceRequestOnly(t *testing.T) {
 	}
 }

+// Tests create pod with default GPU SKU.
+func TestCreatePodWithGPU(t *testing.T) {
+	aadServerMocker := NewAADMock()
+	aciServerMocker := NewACIMock()
+
+ 	podName := "pod-" + uuid.New().String()
+	podNamespace := "ns-" + uuid.New().String()
+	gpuSKU := aci.GPUSKU("sku-" + uuid.New().String())
+
+ 	aciServerMocker.OnGetRPManifest = func() (int, interface{}) {
+		manifest := &aci.ResourceProviderManifest{
+			Metadata: &aci.ResourceProviderMetadata{
+				GPURegionalSKUs: []*aci.GPURegionalSKU{
+					&aci.GPURegionalSKU{
+						Location: fakeRegion,
+						SKUs: []aci.GPUSKU{gpuSKU, aci.K80, aci.P100},
+					},
+				},
+			},
+		}
+
+ 		return http.StatusOK, manifest
+	}
+
+ 	provider, err := createTestProvider(aadServerMocker, aciServerMocker)
+	if err != nil {
+		t.Fatalf("failed to create the test provider. %s", err.Error())
+		return
+	}
+
+ 	aciServerMocker.OnCreate = func(subscription, resourceGroup, containerGroup string, cg *aci.ContainerGroup) (int, interface{}) {
+		assert.Check(t, is.Equal(fakeSubscription, subscription), "Subscription doesn't match")
+		assert.Check(t, is.Equal(fakeResourceGroup, resourceGroup), "Resource group doesn't match")
+		assert.Check(t, is.Equal(podNamespace+"-"+podName, containerGroup), "Container group name is not expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers != nil, "Containers should not be nil")
+		assert.Check(t, is.Equal(1, len(cg.ContainerGroupProperties.Containers)), "1 Container is expected")
+		assert.Check(t, is.Equal("nginx", cg.ContainerGroupProperties.Containers[0].Name), "Container nginx is expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests != nil, "Container resource requests should not be nil")
+		assert.Check(t, is.Equal(1.98, cg.ContainerGroupProperties.Containers[0].Resources.Requests.CPU), "Request CPU is not expected")
+		assert.Check(t, is.Equal(3.4, cg.ContainerGroupProperties.Containers[0].Resources.Requests.MemoryInGB), "Request Memory is not expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU != nil, "Requests GPU is not expected")
+		assert.Check(t, is.Equal(int32(10), cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.Count), "Requests GPU Count is not expected")
+		assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.SKU), "Requests GPU SKU is not expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU != nil, "Limits GPU is not expected")
+		assert.Check(t, is.Equal(int32(10), cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.Count), "Requests GPU Count is not expected")
+		assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.SKU), "Requests GPU SKU is not expected")
+
+ 		return http.StatusOK, cg
+	}
+
+ 	pod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      podName,
+			Namespace: podNamespace,
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				v1.Container{
+					Name: "nginx",
+					Resources: v1.ResourceRequirements{
+						Requests: v1.ResourceList{
+							"cpu":    resource.MustParse("1.981"),
+							"memory": resource.MustParse("3.49G"),
+						},
+						Limits: v1.ResourceList{
+							gpuResourceName: resource.MustParse("10"),
+						},
+					},
+				},
+			},
+		},
+	}
+
+ 	if err := provider.CreatePod(context.Background(), pod); err != nil {
+		t.Fatal("Failed to create pod", err)
+	}
+}
+
+ // Tests create pod with GPU SKU in annotation.
+func TestCreatePodWithGPUSKU(t *testing.T) {
+	aadServerMocker := NewAADMock()
+	aciServerMocker := NewACIMock()
+
+ 	podName := "pod-" + uuid.New().String()
+	podNamespace := "ns-" + uuid.New().String()
+	gpuSKU := aci.GPUSKU("sku-" + uuid.New().String())
+
+ 	aciServerMocker.OnGetRPManifest = func() (int, interface{}) {
+		manifest := &aci.ResourceProviderManifest{
+			Metadata: &aci.ResourceProviderMetadata{
+				GPURegionalSKUs: []*aci.GPURegionalSKU{
+					&aci.GPURegionalSKU{
+						Location: fakeRegion,
+						SKUs: []aci.GPUSKU{aci.K80, aci.P100, gpuSKU},
+					},
+				},
+			},
+		}
+
+ 		return http.StatusOK, manifest
+	}
+
+ 	provider, err := createTestProvider(aadServerMocker, aciServerMocker)
+	if err != nil {
+		t.Fatalf("failed to create the test provider. %s", err.Error())
+		return
+	}
+
+ 	aciServerMocker.OnCreate = func(subscription, resourceGroup, containerGroup string, cg *aci.ContainerGroup) (int, interface{}) {
+		assert.Check(t, is.Equal(fakeSubscription, subscription), "Subscription doesn't match")
+		assert.Check(t, is.Equal(fakeResourceGroup, resourceGroup), "Resource group doesn't match")
+		assert.Check(t, cg != nil, "Container group is nil")
+		assert.Check(t, is.Equal(podNamespace+"-"+podName, containerGroup), "Container group name is not expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers != nil, "Containers should not be nil")
+		assert.Check(t, is.Equal(1, len(cg.ContainerGroupProperties.Containers)), "1 Container is expected")
+		assert.Check(t, is.Equal("nginx", cg.ContainerGroupProperties.Containers[0].Name), "Container nginx is expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests != nil, "Container resource requests should not be nil")
+		assert.Check(t, is.Equal(1.98, cg.ContainerGroupProperties.Containers[0].Resources.Requests.CPU), "Request CPU is not expected")
+		assert.Check(t, is.Equal(3.4, cg.ContainerGroupProperties.Containers[0].Resources.Requests.MemoryInGB), "Request Memory is not expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU != nil, "Requests GPU is not expected")
+		assert.Check(t, is.Equal(int32(1), cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.Count), "Requests GPU Count is not expected")
+		assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Requests.GPU.SKU), "Requests GPU SKU is not expected")
+		assert.Check(t, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU != nil, "Limits GPU is not expected")
+		assert.Check(t, is.Equal(int32(1), cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.Count), "Requests GPU Count is not expected")
+		assert.Check(t, is.Equal(gpuSKU, cg.ContainerGroupProperties.Containers[0].Resources.Limits.GPU.SKU), "Requests GPU SKU is not expected")
+
+ 		return http.StatusOK, cg
+	}
+
+ 	pod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      podName,
+			Namespace: podNamespace,
+			Annotations: map[string]string{
+				gpuTypeAnnotation: string(gpuSKU),
+			},
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{
+				v1.Container{
+					Name: "nginx",
+					Resources: v1.ResourceRequirements{
+						Requests: v1.ResourceList{
+							"cpu":    resource.MustParse("1.981"),
+							"memory": resource.MustParse("3.49G"),
+						},
+						Limits: v1.ResourceList{
+							gpuResourceName: resource.MustParse("1"),
+						},
+					},
+				},
+			},
+		},
+	}
+
+ 	if err := provider.CreatePod(context.Background(), pod); err != nil {
+		t.Fatal("Failed to create pod", err)
+	}
+}
+
 // Tests create pod with both resource request and limit.
 func TestCreatePodWithResourceRequestAndLimit(t *testing.T) {
 	_, aciServerMocker, provider, err := prepareMocks()
@@ -314,7 +475,7 @@ func TestGetPodsWithoutResourceRequestsLimits(t *testing.T) {
 								},
 							},
 							Resources: aci.ResourceRequirements{
-								Requests: &aci.ResourceRequests{
+								Requests: &aci.ComputeResources{
 									CPU:        0.99,
 									MemoryInGB: 1.5,
 								},
@@ -385,7 +546,7 @@ func TestGetPodWithoutResourceRequestsLimits(t *testing.T) {
 								},
 							},
 							Resources: aci.ResourceRequirements{
-								Requests: &aci.ResourceRequests{
+								Requests: &aci.ComputeResources{
 									CPU:        0.99,
 									MemoryInGB: 1.5,
 								},
@@ -412,6 +573,94 @@ func TestGetPodWithoutResourceRequestsLimits(t *testing.T) {
 		pod.Spec.Containers[0].Resources.Requests.Memory().Value()), "Containers[0].Resources.Requests.Memory doesn't match")
 }

+// Tests get pod with GPU.
+func TestGetPodWithGPU(t *testing.T) {
+	_, aciServerMocker, provider, err := prepareMocks()
+
+ 	if err != nil {
+		t.Fatal("Unable to prepare the mocks", err)
+	}
+
+ 	podName := "pod-" + uuid.New().String()
+	podNamespace := "ns-" + uuid.New().String()
+
+ 	aciServerMocker.OnGetContainerGroup = func(subscription, resourceGroup, containerGroup string) (int, interface{}) {
+		assert.Equal(t, fakeSubscription, subscription, "Subscription doesn't match")
+		assert.Equal(t, fakeResourceGroup, resourceGroup, "Resource group doesn't match")
+		assert.Equal(t, podNamespace+"-"+podName, containerGroup, "Container group name is not expected")
+
+ 		return http.StatusOK, aci.ContainerGroup{
+			Tags: map[string]string{
+				"NodeName": fakeNodeName,
+			},
+			ContainerGroupProperties: aci.ContainerGroupProperties{
+				ProvisioningState: "Creating",
+				Containers: []aci.Container{
+					aci.Container{
+						Name: "nginx",
+						ContainerProperties: aci.ContainerProperties{
+							Image:   "nginx",
+							Command: []string{"nginx", "-g", "daemon off;"},
+							Ports: []aci.ContainerPort{
+								{
+									Protocol: aci.ContainerNetworkProtocolTCP,
+									Port:     80,
+								},
+							},
+							Resources: aci.ResourceRequirements{
+								Requests: &aci.ComputeResources{
+									CPU:        0.99,
+									MemoryInGB: 1.5,
+									GPU: &aci.GPUResource{
+										Count: 5,
+										SKU:   aci.P100,
+									},
+								},
+								Limits: &aci.ComputeResources{
+									GPU: &aci.GPUResource{
+										Count: 5,
+										SKU:   aci.P100,
+									},
+								},
+							},
+						},
+					},
+				},
+			},
+		}
+	}
+
+ 	pod, err := provider.GetPod(context.Background(), podNamespace, podName)
+	if err != nil {
+		t.Fatal("Failed to get pod", err)
+	}
+
+ 	assert.Check(t, pod != nil, "Response pod should not be nil")
+	assert.Check(t, pod.Spec.Containers != nil, "Containers should not be nil")
+	assert.Check(t, pod.Spec.Containers[0].Resources.Requests != nil, "Containers[0].Resources.Requests should not be nil")
+	assert.Check(
+		t,
+		is.Equal(ptrQuantity(resource.MustParse("0.99")).Value(), pod.Spec.Containers[0].Resources.Requests.Cpu().Value()),
+		"Containers[0].Resources.Requests.CPU doesn't match")
+	assert.Check(
+		t,
+		is.Equal(ptrQuantity(resource.MustParse("1.5G")).Value(), pod.Spec.Containers[0].Resources.Requests.Memory().Value()),
+		"Containers[0].Resources.Requests.Memory doesn't match")
+	gpuQuantity, ok := pod.Spec.Containers[0].Resources.Requests[gpuResourceName]
+	assert.Check(t, is.Equal(ok, true), "Containers[0].Resources.Requests.GPU should not be nil")
+	assert.Check(
+		t,
+		is.Equal(ptrQuantity(resource.MustParse("5")).Value(), ptrQuantity(gpuQuantity).Value()),
+		"Containers[0].Resources.Requests.GPU.Count doesn't match")
+	assert.Check(t, pod.Spec.Containers[0].Resources.Limits != nil, "Containers[0].Resources.Limits should not be nil")
+	gpuQuantity, ok = pod.Spec.Containers[0].Resources.Limits[gpuResourceName]
+	assert.Check(t, is.Equal(ok, true), "Containers[0].Resources.Requests.GPU should not be nil")
+	assert.Check(
+		t,
+		is.Equal(ptrQuantity(resource.MustParse("5")).Value(), ptrQuantity(gpuQuantity).Value()),
+		"Containers[0].Resources.Limits.GPU.Count doesn't match")
+}
+
 func TestGetPodWithContainerID(t *testing.T) {
 	_, aciServerMocker, provider, err := prepareMocks()

@@ -451,7 +700,7 @@ func TestGetPodWithContainerID(t *testing.T) {
 								},
 							},
 							Resources: aci.ResourceRequirements{
-								Requests: &aci.ResourceRequests{
+								Requests: &aci.ComputeResources{
 									CPU:        0.99,
 									MemoryInGB: 1.5,
 								},
@@ -531,6 +780,30 @@ func prepareMocks() (*AADMock, *ACIMock, *ACIProvider, error) {
 	aadServerMocker := NewAADMock()
 	aciServerMocker := NewACIMock()

+	aciServerMocker.OnGetRPManifest = func() (int, interface{}) {
+		manifest := &aci.ResourceProviderManifest{
+			Metadata: &aci.ResourceProviderMetadata{
+				GPURegionalSKUs: []*aci.GPURegionalSKU{
+					&aci.GPURegionalSKU{
+						Location: fakeRegion,
+						SKUs: []aci.GPUSKU{aci.K80, aci.P100, aci.V100},
+					},
+				},
+			},
+		}
+
+		return http.StatusOK, manifest
+	}
+
+	provider, err := createTestProvider(aadServerMocker, aciServerMocker)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("failed to create the test provider %s", err.Error())
+	}
+
+	return aadServerMocker, aciServerMocker, provider, nil
+}
+
+func createTestProvider(aadServerMocker *AADMock, aciServerMocker*ACIMock) (*ACIProvider, error) {
 	auth := azure.NewAuthentication(
 		azure.PublicCloud.Name,
 		fakeClientID,
@@ -543,7 +816,7 @@ func prepareMocks() (*AADMock, *ACIMock, *ACIProvider, error) {

 	file, err := ioutil.TempFile("", "auth.json")
 	if err != nil {
-		return nil, nil, nil, err
+		return nil, err
 	}

 	defer os.Remove(file.Name())
@@ -552,23 +825,25 @@ func prepareMocks() (*AADMock, *ACIMock, *ACIProvider, error) {
 	json.NewEncoder(b).Encode(auth)

 	if _, err := file.Write(b.Bytes()); err != nil {
-		return nil, nil, nil, err
+		return nil, err
 	}

 	os.Setenv("AZURE_AUTH_LOCATION", file.Name())
 	os.Setenv("ACI_RESOURCE_GROUP", fakeResourceGroup)
+	os.Setenv("ACI_REGION", fakeRegion)

 	rm, err := manager.NewResourceManager(nil, nil, nil)
+
 	if err != nil {
-		return nil, nil, nil, err
+		return nil, err
 	}

 	provider, err := NewACIProvider("example.toml", rm, fakeNodeName, "Linux", "0.0.0.0", 10250)
 	if err != nil {
-		return nil, nil, nil, err
+		return nil, err
 	}

-	return aadServerMocker, aciServerMocker, provider, nil
+	return provider, nil
 }

 func ptrQuantity(q resource.Quantity) *resource.Quantity {
--- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/analytics.go
+++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/analytics.go
@@ -7,6 +7,7 @@ import (
 	"io/ioutil"
 )

+// NewContainerGroupDiagnostics creates a container group diagnostics object
 func NewContainerGroupDiagnostics(logAnalyticsID, logAnalyticsKey string) (*ContainerGroupDiagnostics, error) {

 	if logAnalyticsID == "" || logAnalyticsKey == "" {
@@ -21,6 +22,7 @@ func NewContainerGroupDiagnostics(logAnalyticsID, logAnalyticsKey string) (*Cont
 	}, nil
 }

+// NewContainerGroupDiagnosticsFromFile creates a container group diagnostics object from the specified file
 func NewContainerGroupDiagnosticsFromFile(filepath string) (*ContainerGroupDiagnostics, error) {

 	analyticsdata, err := ioutil.ReadFile(filepath)
--- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/client.go
+++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/client.go
@@ -14,8 +14,8 @@ import (
 const (
 	// BaseURI is the default URI used for compute services.
 	baseURI          = "https://management.azure.com"
-	defaultUserAgent = "virtual-kubelet/azure-arm-aci/2018-09-01"
-	apiVersion       = "2018-09-01"
+	defaultUserAgent = "virtual-kubelet/azure-arm-aci/2018-10-01"
+	apiVersion       = "2018-10-01"

 	containerGroupURLPath                    = "subscriptions/{{.subscriptionId}}/resourceGroups/{{.resourceGroup}}/providers/Microsoft.ContainerInstance/containerGroups/{{.containerGroupName}}"
 	containerGroupListURLPath                = "subscriptions/{{.subscriptionId}}/providers/Microsoft.ContainerInstance/containerGroups"
--- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/exec.go
+++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/exec.go
@@ -11,12 +11,13 @@ import (
 	"github.com/virtual-kubelet/azure-aci/client/api"
 )

+// TerminalSizeRequest is the terminal size request
 type TerminalSizeRequest struct {
 	Width  int
 	Height int
 }

-// Starts the exec command for a specified container instance in a specified resource group and container group.
+// LaunchExec starts the exec command for a specified container instance in a specified resource group and container group.
 // From: https://docs.microsoft.com/en-us/rest/api/container-instances/startcontainer/launchexec
 func (c *Client) LaunchExec(resourceGroup, containerGroupName, containerName, command string, terminalSize TerminalSizeRequest) (ExecResponse, error) {
 	urlParams := url.Values{
@@ -35,7 +36,7 @@ func (c *Client) LaunchExec(resourceGroup, containerGroupName, containerName, co

 	var xcrsp ExecResponse
 	xcrsp.Password = ""
-	xcrsp.WebSocketUri = ""
+	xcrsp.WebSocketURI = ""

 	b := new(bytes.Buffer)

--- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/get.go
+++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/get.go
@@ -14,7 +14,7 @@ import (
 // GetContainerGroup gets an Azure Container Instance in the provided
 // resource group with the given container group name.
 // From: https://docs.microsoft.com/en-us/rest/api/container-instances/containergroups/get
-func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, containerGroupName string) (*ContainerGroup, error, *int) {
+func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, containerGroupName string) (*ContainerGroup, *int, error) {
 	urlParams := url.Values{
 		"api-version": []string{apiVersion},
 	}
@@ -26,7 +26,7 @@ func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, container
 	// Create the request.
 	req, err := http.NewRequest("GET", uri, nil)
 	if err != nil {
-		return nil, fmt.Errorf("Creating get container group uri request failed: %v", err), nil
+		return nil, nil, fmt.Errorf("Creating get container group uri request failed: %v", err)
 	}
 	req = req.WithContext(ctx)

@@ -36,29 +36,29 @@ func (c *Client) GetContainerGroup(ctx context.Context, resourceGroup, container
 		"resourceGroup":      resourceGroup,
 		"containerGroupName": containerGroupName,
 	}); err != nil {
-		return nil, fmt.Errorf("Expanding URL with parameters failed: %v", err), nil
+		return nil, nil, fmt.Errorf("Expanding URL with parameters failed: %v", err)
 	}

 	// Send the request.
 	resp, err := c.hc.Do(req)
 	if err != nil {
-		return nil, fmt.Errorf("Sending get container group request failed: %v", err), nil
+		return nil, nil, fmt.Errorf("Sending get container group request failed: %v", err)
 	}
 	defer resp.Body.Close()

 	// 200 (OK) is a success response.
 	if err := api.CheckResponse(resp); err != nil {
-		return nil, err, &resp.StatusCode
+		return nil, &resp.StatusCode, err
 	}

 	// Decode the body from the response.
 	if resp.Body == nil {
-		return nil, errors.New("Get container group returned an empty body in the response"), &resp.StatusCode
+		return nil, &resp.StatusCode, errors.New("Get container group returned an empty body in the response")
 	}
 	var cg ContainerGroup
 	if err := json.NewDecoder(resp.Body).Decode(&cg); err != nil {
-		return nil, fmt.Errorf("Decoding get container group response body failed: %v", err), &resp.StatusCode
+		return nil, &resp.StatusCode, fmt.Errorf("Decoding get container group response body failed: %v", err)
 	}

-	return &cg, nil, &resp.StatusCode
+	return &cg, &resp.StatusCode, nil
 }
--- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/rp.go
+++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/rp.go
@@ -0,0 +1,76 @@
+package aci
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"net/http"
+	"net/url"
+
+	"github.com/virtual-kubelet/azure-aci/client/api"
+)
+
+const (
+	resourceProviderURLPath    = "providers/Microsoft.ContainerInstance"
+	resourceProviderAPIVersion = "2018-02-01"
+)
+
+// GetResourceProviderMetadata gets the ACI resource provider metadata
+func (c *Client) GetResourceProviderMetadata(ctx context.Context) (*ResourceProviderMetadata, error) {
+	manifest, err := c.getResourceProviderManifest(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	if manifest == nil {
+		return nil, fmt.Errorf("The resource provider manifest is empty")
+	}
+
+	if manifest.Metadata == nil {
+		return nil, fmt.Errorf("The resource provider metadata is empty")
+	}
+
+	return manifest.Metadata, nil
+}
+
+func (c *Client) getResourceProviderManifest(ctx context.Context) (*ResourceProviderManifest, error) {
+	urlParams := url.Values{
+		"api-version": []string{resourceProviderAPIVersion},
+		"$expand":     []string{"metadata"},
+	}
+
+	// Create the url.
+	uri := api.ResolveRelative(c.auth.ResourceManagerEndpoint, resourceProviderURLPath)
+	uri += "?" + url.Values(urlParams).Encode()
+
+	// Create the request.
+	req, err := http.NewRequest("GET", uri, nil)
+	if err != nil {
+		return nil, fmt.Errorf("Creating get resource provider manifest request failed: %v", err)
+	}
+	req = req.WithContext(ctx)
+
+	// Send the request.
+	resp, err := c.hc.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("Sending get resource provider manifest request failed: %v", err)
+	}
+	defer resp.Body.Close()
+
+	// 200 (OK) is a success response.
+	if err := api.CheckResponse(resp); err != nil {
+		return nil, err
+	}
+
+	// Decode the body from the response.
+	if resp.Body == nil {
+		return nil, errors.New("Get resource provider manifest returned an empty body in the response")
+	}
+	var manifest ResourceProviderManifest
+	if err := json.NewDecoder(resp.Body).Decode(&manifest); err != nil {
+		return nil, fmt.Errorf("Decoding get resource provider manifest response body failed: %v", err)
+	}
+
+	return &manifest, nil
+}
--- a/vendor/github.com/virtual-kubelet/azure-aci/client/aci/types.go
+++ b/vendor/github.com/virtual-kubelet/azure-aci/client/aci/types.go
@@ -237,22 +237,35 @@ type Resource struct {
 	Tags     map[string]string `json:"tags,omitempty"`
 }

-// ResourceLimits is the resource limits.
-type ResourceLimits struct {
-	MemoryInGB float64 `json:"memoryInGB,omitempty"`
-	CPU        float64 `json:"cpu,omitempty"`
+// GPUSKU enumerates the values for GPU SKUs
+type GPUSKU string
+
+const (
+	// K80 specifies the K80 GPU SKU
+	K80  GPUSKU = "K80"
+	// P100 specifies the P100 GPU SKU
+	P100 GPUSKU = "P100"
+	// V100 specifies the V100 GPU SKU
+	V100 GPUSKU = "V100"
+)
+
+// GPUResource is the GPU resource for the container group.
+type GPUResource struct {
+	Count int32  `json:"count"`
+	SKU   GPUSKU `json:"sku"`
 }

-// ResourceRequests is the resource requests.
-type ResourceRequests struct {
-	MemoryInGB float64 `json:"memoryInGB,omitempty"`
-	CPU        float64 `json:"cpu,omitempty"`
+// ComputeResources is the compute resource.
+type ComputeResources struct {
+	MemoryInGB float64      `json:"memoryInGB,omitempty"`
+	CPU        float64      `json:"cpu,omitempty"`
+	GPU        *GPUResource `json:"gpu,omitempty"`
 }

 // ResourceRequirements is the resource requirements.
 type ResourceRequirements struct {
-	Requests *ResourceRequests `json:"requests,omitempty"`
-	Limits   *ResourceLimits   `json:"limits,omitempty"`
+	Requests *ComputeResources `json:"requests,omitempty"`
+	Limits   *ComputeResources `json:"limits,omitempty"`
 }

 // Usage is a single usage result
@@ -305,7 +318,7 @@ type ExecRequest struct {

 // ExecResponse is a request for Launch Exec API response for ACI.
 type ExecResponse struct {
-	WebSocketUri string `json:"webSocketUri,omitempty"`
+	WebSocketURI string `json:"webSocketUri,omitempty"`
 	Password     string `json:"password,omitempty"`
 }

@@ -488,3 +501,20 @@ const (
 	LogAnalyticsMetadataKeyNodeName          string = "node-name"
 	LogAnalyticsMetadataKeyClusterResourceID string = "cluster-resource-id"
 )
+
+// GPURegionalSKU is the ACI GPU regional SKU
+type GPURegionalSKU struct {
+	Location string   `json:"location"`
+	SKUs     []GPUSKU `json:"skus"`
+}
+
+// ResourceProviderMetadata is the ACI resource provider metadata
+type ResourceProviderMetadata struct {
+	VNetSupportRegions []string          `json:"vnetSupportRegions,omitempty"`
+	GPURegionalSKUs    []*GPURegionalSKU `json:"gpuRegionalSkus,omitempty"`
+}
+
+// ResourceProviderManifest is the ACI resource provider manifest
+type ResourceProviderManifest struct {
+	Metadata *ResourceProviderMetadata `json:"metadata"`
+}