Create a provider to use Azure Batch (#133)

* Started work on provider

* WIP Adding batch provider

* Working basic call into pool client. Need to parameterize the baseurl

* Fixed job creation by manipulating the content-type

* WIP Kicking off containers. Dirty

* [wip] More meat around scheduling simple containers.

* Working on basic task wrapper to co-schedule pods

* WIP on task wrapper

* WIP

* Working pod minimal wrapper for batch

* Integrate pod template code into provider

* Cleaning up

* Move to docker without gpu

* WIP batch integration

* partially working

* Working logs

* Tidy code

* WIP: Testing and readme

* Added readme and terraform deployment for GPU Azure Batch pool.

* Update to enable low priority nodes for gpu

* Fix log formatting bug. Return node logs when container not yet started

* Moved to golang v1.10

* Fix cri test

* Fix up minor docs issue. Add provider to readme. Add var for vk image.
This commit is contained in:
Lawrence Gripper
2018-06-23 00:33:49 +01:00
committed by Robbie Zhang
parent 1ad6fb434e
commit d6e8b3daf7
75 changed files with 20040 additions and 6 deletions

View File

@@ -0,0 +1,59 @@
# Random material for uniquely naming per-deployment resources.
# NOTE(review): random_id.workspace is not referenced anywhere in this file;
# confirm it is consumed elsewhere before removing.
resource "random_id" "workspace" {
keepers = {
# Generate a new id each time we switch to a new resource group
group_name = "${var.resource_group_name}"
}
byte_length = 8
}
#an attempt to keep the AKS name (and dns label) somewhat unique
resource "random_integer" "random_int" {
min = 100
max = 999
}
# AKS cluster that will host the virtual-kubelet deployment. The random
# integer keeps the cluster name and DNS label reasonably unique.
resource "azurerm_kubernetes_cluster" "aks" {
name = "aks-${random_integer.random_int.result}"
location = "${var.resource_group_location}"
dns_prefix = "aks-${random_integer.random_int.result}"
resource_group_name = "${var.resource_group_name}"
# Pinned Kubernetes version; bump deliberately when upgrading.
kubernetes_version = "1.9.2"
linux_profile {
admin_username = "${var.linux_admin_username}"
ssh_key {
key_data = "${var.linux_admin_ssh_publickey}"
}
}
# Two-node Linux agent pool for the regular (non-Batch) workloads.
agent_pool_profile {
name = "agentpool"
count = "2"
vm_size = "Standard_DS2_v2"
os_type = "Linux"
}
# Service principal AKS uses to manage Azure resources.
service_principal {
client_id = "${var.client_id}"
client_secret = "${var.client_secret}"
}
}
# Kubeconfig material for the cluster, consumed by the virtualkubelet module.
# The client certificate, key and CA are secrets, so they are marked
# sensitive to keep Terraform from echoing them in plan/apply output.
output "cluster_client_certificate" {
  sensitive = true
  value     = "${base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.client_certificate)}"
}

output "cluster_client_key" {
  sensitive = true
  value     = "${base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.client_key)}"
}

output "cluster_ca" {
  sensitive = true
  value     = "${base64decode(azurerm_kubernetes_cluster.aks.kube_config.0.cluster_ca_certificate)}"
}

# API server endpoint of the cluster.
output "host" {
  value = "${azurerm_kubernetes_cluster.aks.kube_config.0.host}"
}

View File

@@ -0,0 +1,31 @@
# Inputs for the AKS module.
variable "client_id" {
type = "string"
description = "Client ID"
}
variable "client_secret" {
type = "string"
description = "Client secret."
}
variable "resource_group_name" {
type = "string"
description = "Name of the azure resource group."
default = "akc-rg"
}
variable "resource_group_location" {
type = "string"
description = "Location of the azure resource group."
default = "eastus"
}
variable "linux_admin_username" {
type = "string"
description = "User name for authentication to the Kubernetes linux agent virtual machines in the cluster."
}
variable "linux_admin_ssh_publickey" {
type = "string"
description = "Configure all the linux virtual machines in the cluster with the SSH RSA public key string. The key should include three parts, for example 'ssh-rsa AAAAB...snip...UcyupgH azureuser@linuxvm'"
}

View File

@@ -0,0 +1,146 @@
# Lower-case, letters-only random name for the Batch account — Batch account
# names must be globally unique and restricted to lower-case alphanumerics.
resource "random_string" "batchname" {
keepers = {
# Generate a new id each time we switch to a new resource group
group_name = "${var.resource_group_name}"
}
length = 8
upper = false
special = false
number = false
}
# Deploys the Azure Batch account and its GPU pool via an inline ARM template
# (the azurerm provider at this version has no native resource covering Batch
# pool settings such as start tasks and low-priority node counts).
#
# Fix: the template previously hardcoded "vmSize": "STANDARD_NC6", silently
# ignoring the vmSku parameter passed in from var.vm_sku. It now uses
# [parameters('vmSku')] so the caller-selected SKU is honoured.
resource "azurerm_template_deployment" "test" {
  name                = "tfdeployment"
  resource_group_name = "${var.resource_group_name}"

  # these key-value pairs are passed into the ARM Template's `parameters` block
  parameters {
    "batchAccountName"      = "${random_string.batchname.result}"
    "storageAccountID"      = "${var.storage_account_id}"
    "poolBoostrapScriptUrl" = "${var.pool_bootstrap_script_url}"
    "location"              = "${var.resource_group_location}"
    "poolID"                = "${var.pool_id}"
    "vmSku"                 = "${var.vm_sku}"
    "lowPriorityNodeCount"  = "${var.low_priority_node_count}"
    "dedicatedNodeCount"    = "${var.dedicated_node_count}"
  }

  deployment_mode = "Incremental"

  template_body = <<DEPLOY
{
    "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
    "contentVersion": "1.0.0.0",
    "parameters": {
        "batchAccountName": {
            "type": "string",
            "metadata": {
                "description": "Batch Account Name"
            }
        },
        "poolID": {
            "type": "string",
            "metadata": {
                "description": "GPU Pool ID"
            }
        },
        "dedicatedNodeCount": {
            "type": "string"
        },
        "lowPriorityNodeCount": {
            "type": "string"
        },
        "vmSku": {
            "type": "string"
        },
        "storageAccountID": {
            "type": "string"
        },
        "poolBoostrapScriptUrl": {
            "type": "string"
        },
        "location": {
            "type": "string",
            "defaultValue": "[resourceGroup().location]",
            "metadata": {
                "description": "Location for all resources."
            }
        }
    },
    "resources": [
        {
            "type": "Microsoft.Batch/batchAccounts",
            "name": "[parameters('batchAccountName')]",
            "apiVersion": "2015-12-01",
            "location": "[parameters('location')]",
            "tags": {
                "ObjectName": "[parameters('batchAccountName')]"
            },
            "properties": {
                "autoStorage": {
                    "storageAccountId": "[parameters('storageAccountID')]"
                }
            }
        },
        {
            "type": "Microsoft.Batch/batchAccounts/pools",
            "name": "[concat(parameters('batchAccountName'), '/', parameters('poolID'))]",
            "apiVersion": "2017-09-01",
            "scale": null,
            "properties": {
                "vmSize": "[parameters('vmSku')]",
                "interNodeCommunication": "Disabled",
                "maxTasksPerNode": 1,
                "taskSchedulingPolicy": {
                    "nodeFillType": "Spread"
                },
                "startTask": {
                    "commandLine": "/bin/bash -c ./init.sh",
                    "resourceFiles": [
                        {
                            "blobSource": "[parameters('poolBoostrapScriptUrl')]",
                            "fileMode": "777",
                            "filePath": "./init.sh"
                        }
                    ],
                    "userIdentity": {
                        "autoUser": {
                            "elevationLevel": "Admin",
                            "scope": "Pool"
                        }
                    },
                    "waitForSuccess": true,
                    "maxTaskRetryCount": 0
                },
                "deploymentConfiguration": {
                    "virtualMachineConfiguration": {
                        "imageReference": {
                            "publisher": "Canonical",
                            "offer": "UbuntuServer",
                            "sku": "16.04-LTS",
                            "version": "latest"
                        },
                        "nodeAgentSkuId": "batch.node.ubuntu 16.04"
                    }
                },
                "scaleSettings": {
                    "fixedScale": {
                        "targetDedicatedNodes": "[parameters('dedicatedNodeCount')]",
                        "targetLowPriorityNodes": "[parameters('lowPriorityNodeCount')]",
                        "resizeTimeout": "PT15M"
                    }
                }
            },
            "dependsOn": [
                "[resourceId('Microsoft.Batch/batchAccounts', parameters('batchAccountName'))]"
            ]
        }
    ]
}
DEPLOY
}
# Name of the deployed Batch account (consumed by the virtualkubelet module).
output "name" {
value = "${random_string.batchname.result}"
}

View File

@@ -0,0 +1,43 @@
# Inputs for the Azure Batch module. Node counts are strings because they are
# passed straight through as ARM template parameters of type "string".
variable "pool_id" {
  type        = "string"
  description = "Name of the Azure Batch pool to create."
  default     = "pool1"
}

variable "vm_sku" {
  type        = "string"
  description = "VM SKU to use - defaults to the NC6 GPU SKU."
  default     = "STANDARD_NC6"
}

variable "pool_bootstrap_script_url" {
  type        = "string"
  description = "Publicly accessible url used for bootstrapping nodes in the pool. Installing GPU drivers, for example."
}

variable "storage_account_id" {
  type        = "string"
  description = "Name of the storage account to be used by Azure Batch"
}

variable "resource_group_name" {
  type        = "string"
  description = "Name of the azure resource group."
  default     = "akc-rg"
}

variable "resource_group_location" {
  type        = "string"
  description = "Location of the azure resource group."
  default     = "eastus"
}

variable "low_priority_node_count" {
  type        = "string"
  description = "The number of low priority nodes to allocate to the pool"
}

variable "dedicated_node_count" {
  type        = "string"
  description = "The number of dedicated nodes to allocate to the pool"
}

View File

@@ -0,0 +1,19 @@
# Example pod that runs a CUDA vector-add on a GPU scheduled through the
# virtual-kubelet Azure Batch provider. (Indentation restored — the manifest
# is invalid YAML without it.)
apiVersion: v1
kind: Pod
metadata:
  name: cuda-vector-add
  labels:
    app: examplegpupod
spec:
  restartPolicy: OnFailure
  containers:
    - name: cuda-vector-add
      # https://github.com/kubernetes/kubernetes/blob/v1.7.11/test/images/nvidia-cuda/Dockerfile
      image: "k8s.gcr.io/cuda-vector-add:v0.1"
      resources:
        limits:
          nvidia.com/gpu: 1 # requesting 1 GPU
  # Pin the pod to the virtual-kubelet node and tolerate its taint so it is
  # scheduled onto Azure Batch rather than a regular agent node.
  nodeName: virtual-kubelet
  tolerations:
    - key: azure.com/batch
      effect: NoSchedule

View File

@@ -0,0 +1,20 @@
# Example pod that runs nvidia-smi on a Batch GPU node via virtual-kubelet.
# (Indentation restored — the manifest is invalid YAML without it.)
apiVersion: v1
kind: Pod
metadata:
  # NOTE(review): name looks like a typo for "examplegpujob"; kept as-is in
  # case other docs reference it.
  name: exampegpujob
spec:
  containers:
    - image: nvidia/cuda
      command: ["nvidia-smi"]
      imagePullPolicy: Always
      name: nvidia
      resources:
        requests:
          memory: 1G
          cpu: 1
        limits:
          nvidia.com/gpu: 1 # requesting 1 GPU
  # Pin to the virtual-kubelet node and tolerate its azure.com/batch taint.
  nodeName: virtual-kubelet
  tolerations:
    - key: azure.com/batch
      effect: NoSchedule

View File

@@ -0,0 +1,53 @@
# Top-level wiring for the GPU example: resource group, AKS cluster, storage
# for the pool bootstrap script, the Azure Batch account/pool, and the
# virtual-kubelet deployment that bridges AKS to Batch.
resource "azurerm_resource_group" "batchrg" {
name = "${var.resource_group_name}"
location = "${var.resource_group_location}"
}
# AKS cluster that hosts the virtual-kubelet pod.
module "aks" {
source = "aks"
//Defaults to using current ssh key: recommend changing as needed
linux_admin_username = "aks"
linux_admin_ssh_publickey = "${file("~/.ssh/id_rsa.pub")}"
client_id = "${var.client_id}"
client_secret = "${var.client_secret}"
resource_group_name = "${azurerm_resource_group.batchrg.name}"
resource_group_location = "${azurerm_resource_group.batchrg.location}"
}
# Uploads the pool bootstrap script and exposes a SAS url for Batch nodes.
module "storage" {
source = "storage"
pool_bootstrap_script_path = "./scripts/poolstartup.sh"
resource_group_name = "${azurerm_resource_group.batchrg.name}"
resource_group_location = "${azurerm_resource_group.batchrg.location}"
}
# Batch account + GPU pool: 1 dedicated node, 2 low-priority nodes.
module "azurebatch" {
source = "azurebatch"
storage_account_id = "${module.storage.id}"
pool_bootstrap_script_url = "${module.storage.pool_boostrap_script_url}"
resource_group_name = "${azurerm_resource_group.batchrg.name}"
resource_group_location = "${azurerm_resource_group.batchrg.location}"
dedicated_node_count = 1
low_priority_node_count = 2
}
# Deploys virtual-kubelet into AKS, pointed at the Batch account above.
module "virtualkubelet" {
source = "virtualkubelet"
virtualkubelet_docker_image = "${var.virtualkubelet_docker_image}"
cluster_client_key = "${module.aks.cluster_client_key}"
cluster_client_certificate = "${module.aks.cluster_client_certificate}"
cluster_ca = "${module.aks.cluster_ca}"
cluster_host = "${module.aks.host}"
azure_batch_account_name = "${module.azurebatch.name}"
resource_group_location = "${azurerm_resource_group.batchrg.location}"
}

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Bootstraps an Azure Batch pool node: installs Docker CE, the NVIDIA CUDA
# driver and nvidia-docker2, then relocates Docker's data root onto the
# node's temp disk and exposes the daemon on tcp://127.0.0.1:2375.
# Runs as the pool start task (elevated autoUser); the task's waitForSuccess
# flag means any failure blocks the node, so fail fast on errors.
set -e

export DEBIAN_FRONTEND=noninteractive
export TEMP_DISK=/mnt

# Refresh package lists before the first install.
apt-get update -y -q
apt-get install -y -q --no-install-recommends \
	build-essential

# Add dockerce repo
apt-get install -y -q -o Dpkg::Options::="--force-confnew" --no-install-recommends \
	apt-transport-https ca-certificates curl software-properties-common cgroup-lite
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
apt-get update

#Install latest cuda driver..
CUDA_REPO_PKG=cuda-repo-ubuntu1604_9.1.85-1_amd64.deb
wget -O /tmp/${CUDA_REPO_PKG} http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/${CUDA_REPO_PKG}
sudo dpkg -i /tmp/${CUDA_REPO_PKG}
sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
rm -f /tmp/${CUDA_REPO_PKG}
sudo apt-get update -y -q
sudo apt-get install cuda-drivers -y -q --no-install-recommends

# install nvidia-docker
curl -fSsL https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
curl -fSsL https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list | \
	tee /etc/apt/sources.list.d/nvidia-docker.list
apt-get update -y -q
apt-get install -y -q --no-install-recommends -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confnew" nvidia-docker2
systemctl restart docker.service
nvidia-docker version

# prep docker: move the data root to the temp disk and listen on localhost TCP.
systemctl stop docker.service
rm -rf /var/lib/docker
mkdir -p /etc/docker
# Fix: these previously used the undefined $TEMPDISK (note the missing
# underscore), so the docker directory was created at the filesystem root
# instead of on the temp disk.
mkdir -p $TEMP_DISK/docker
chmod 777 $TEMP_DISK/docker
echo "{ \"data-root\": \"$TEMP_DISK/docker\", \"hosts\": [ \"unix:///var/run/docker.sock\", \"tcp://127.0.0.1:2375\" ] }" > /etc/docker/daemon.json.merge
# Merge our settings with the daemon.json written by nvidia-docker2 (existing
# keys win so the nvidia runtime config is preserved).
python -c "import json;a=json.load(open('/etc/docker/daemon.json.merge'));b=json.load(open('/etc/docker/daemon.json'));a.update(b);f=open('/etc/docker/daemon.json','w');json.dump(a,f);f.close();"
rm -f /etc/docker/daemon.json.merge
# Strip the -H flags from the unit file so daemon.json's "hosts" takes effect.
sed -i 's|^ExecStart=/usr/bin/dockerd.*|ExecStart=/usr/bin/dockerd|' /lib/systemd/system/docker.service
systemctl daemon-reload
systemctl start docker.service

View File

@@ -0,0 +1,77 @@
# Lower-case, letters-only random suffix — storage account names must be
# globally unique, 3-24 chars, lower-case alphanumerics only.
resource "random_string" "storage" {
keepers = {
# Generate a new id each time we switch to a new resource group
group_name = "${var.resource_group_name}"
}
length = 8
upper = false
special = false
number = false
}
# Storage account used as the Batch account's auto-storage and for hosting
# the pool bootstrap script.
resource "azurerm_storage_account" "batchstorage" {
name = "${lower(random_string.storage.result)}"
resource_group_name = "${var.resource_group_name}"
location = "${var.resource_group_location}"
account_tier = "Standard"
account_replication_type = "LRS"
}
# Private container holding the bootstrap script; access is granted to Batch
# nodes via the SAS generated below.
resource "azurerm_storage_container" "boostrapscript" {
name = "scripts"
resource_group_name = "${var.resource_group_name}"
storage_account_name = "${azurerm_storage_account.batchstorage.name}"
container_access_type = "private"
}
# Uploads the local bootstrap script as init.sh (the name the Batch pool's
# start task expects).
resource "azurerm_storage_blob" "initscript" {
name = "init.sh"
resource_group_name = "${var.resource_group_name}"
storage_account_name = "${azurerm_storage_account.batchstorage.name}"
storage_container_name = "${azurerm_storage_container.boostrapscript.name}"
type = "block"
source = "${var.pool_bootstrap_script_path}"
}
# Read-only, blob-object-scoped SAS so Batch pool nodes can download the
# bootstrap script from the private container. 8776h is roughly one year.
# NOTE(review): start/expiry are derived from timestamp(), so a new SAS is
# generated on every plan/apply — confirm that churn is acceptable.
data "azurerm_storage_account_sas" "scriptaccess" {
connection_string = "${azurerm_storage_account.batchstorage.primary_connection_string}"
https_only = true
resource_types {
service = false
container = false
object = true
}
services {
blob = true
queue = false
table = false
file = false
}
start = "${timestamp()}"
expiry = "${timeadd(timestamp(), "8776h")}"
permissions {
read = true
write = false
delete = false
list = false
add = false
create = false
update = false
process = false
}
}
# Bootstrap script URL with the SAS token appended; passed to the Batch pool
# start task.
output "pool_boostrap_script_url" {
value = "${azurerm_storage_blob.initscript.url}${data.azurerm_storage_account_sas.scriptaccess.sas}"
}
# Resource ID of the storage account (used as Batch auto-storage).
output "id" {
value = "${azurerm_storage_account.batchstorage.id}"
}

View File

@@ -0,0 +1,14 @@
# Inputs for the storage module.
variable "resource_group_name" {
  description = "Resource group name"
  type        = "string"
}

variable "resource_group_location" {
  description = "Resource group location"
  type        = "string"
}

variable "pool_bootstrap_script_path" {
  description = "The filepath of the pool bootstrapping script"
  type        = "string"
}

View File

@@ -0,0 +1,23 @@
# Root-level inputs for the GPU example deployment.
variable "client_id" {
  type        = "string"
  description = "Client ID"
}

variable "client_secret" {
  type        = "string"
  description = "Client secret."
}

variable "resource_group_name" {
  description = "Resource group name"
  type        = "string"
}

variable "resource_group_location" {
  description = "Resource group location"
  type        = "string"
}

variable "virtualkubelet_docker_image" {
  type        = "string"
  description = "The docker image to use for deploying the virtual kubelet"
}

View File

@@ -0,0 +1,14 @@
// Example variable values — replace the placeholder GUIDs before use.
// Provide the Client ID of a service principal for use by AKS
client_id = "00000000-0000-0000-0000-000000000000"
// Provide the Client Secret of a service principal for use by AKS
client_secret = "00000000-0000-0000-0000-000000000000"
// The resource group you would like to deploy to
resource_group_name = "vkgpu"
// The location of all resources
resource_group_location = "westeurope"
// Virtual Kubelet docker image
virtualkubelet_docker_image = "microsoft/virtual-kubelet"

View File

@@ -0,0 +1,126 @@
# Kubernetes provider configured directly from the AKS kubeconfig material
# passed in by the caller (no local kubeconfig file needed).
provider "kubernetes" {
host = "${var.cluster_host}"
client_certificate = "${var.cluster_client_certificate}"
client_key = "${var.cluster_client_key}"
cluster_ca_certificate = "${var.cluster_ca}"
}
# Secret holding the apiserver client cert/key, mounted into the
# virtual-kubelet container at /etc/virtual-kubelet.
# NOTE(review): the dotted keys are unquoted; HCL may expand `cert.pem` into
# a nested structure — confirm the secret really ends up with the keys
# cert.pem / key.pem that the container paths below expect.
resource "kubernetes_secret" "vkcredentials" {
metadata {
name = "vkcredentials"
}
data {
cert.pem = "${var.cluster_client_certificate}"
key.pem = "${var.cluster_client_key}"
}
}
# Runs virtual-kubelet inside the AKS cluster, registering a virtual node
# backed by the Azure Batch provider.
resource "kubernetes_deployment" "vkdeployment" {
metadata {
name = "vkdeployment"
}
spec {
selector {
app = "virtualkubelet"
}
template {
metadata {
labels {
app = "virtualkubelet"
}
}
spec {
container {
name = "vk"
image = "${var.virtualkubelet_docker_image}"
# Use the azurebatch provider and taint the virtual node so only pods
# that explicitly tolerate azure.com/batch are scheduled onto Batch.
args = [
"--provider",
"azurebatch",
"--taint",
"azure.com/batch",
"--namespace",
"default",
]
# Kubelet API port (matches the KUBELET_PORT env var below).
port {
container_port = 10250
protocol = "TCP"
name = "kubeletport"
}
# Host's AKS service-principal credentials file; the provider reads it
# via AZURE_CREDENTIALS_LOCATION.
volume_mount {
name = "azure-credentials"
mount_path = "/etc/aks/azure.json"
}
# Apiserver client cert/key from the vkcredentials secret
# (APISERVER_CERT_LOCATION / APISERVER_KEY_LOCATION below).
volume_mount {
name = "credentials"
mount_path = "/etc/virtual-kubelet"
}
env = [
{
name = "AZURE_BATCH_ACCOUNT_LOCATION"
value = "${var.resource_group_location}"
},
{
name = "AZURE_BATCH_ACCOUNT_NAME"
value = "${var.azure_batch_account_name}"
},
{
name = "AZURE_BATCH_POOLID"
value = "${var.azure_batch_pool_id}"
},
{
name = "KUBELET_PORT"
value = "10250"
},
{
name = "AZURE_CREDENTIALS_LOCATION"
value = "/etc/aks/azure.json"
},
{
name = "APISERVER_CERT_LOCATION"
value = "/etc/virtual-kubelet/cert.pem"
},
{
name = "APISERVER_KEY_LOCATION"
value = "/etc/virtual-kubelet/key.pem"
},
{
# Pod IP injected via the downward API.
name = "VKUBELET_POD_IP"
value_from {
field_ref {
field_path = "status.podIP"
}
}
},
]
}
# AKS agent nodes keep the service-principal credentials at this path.
volume {
name = "azure-credentials"
host_path {
path = "/etc/kubernetes/azure.json"
}
}
volume {
name = "credentials"
secret {
secret_name = "vkcredentials"
}
}
}
}
}
}

View File

@@ -0,0 +1,41 @@
# Inputs for the virtualkubelet module.
variable "cluster_client_certificate" {
  type        = "string"
  description = "Cluster client Certificate"

  # Fix: removed the previous `default = "eastus"` — an Azure region copied
  # from a location variable, never a valid certificate. Callers must supply
  # the real PEM material (the root module passes module.aks output).
}

variable "cluster_client_key" {
  type        = "string"
  description = "Cluster client Certificate Key"
}

variable "cluster_ca" {
  type        = "string"
  description = "Cluster Certificate Authority"
}

variable "cluster_host" {
  type        = "string"
  description = "Cluster Admin API host"
}

variable "virtualkubelet_docker_image" {
  type        = "string"
  description = "The docker image to use for deploying the virtual kubelet"
}

variable "azure_batch_account_name" {
  type        = "string"
  description = "The name of the Azure Batch account to use"
}

variable "azure_batch_pool_id" {
  type        = "string"
  description = "The PoolID to use in Azure batch"
  default     = "pool1"
}

variable "resource_group_location" {
  description = "Resource group location"
  type        = "string"
}