Create a provider to use Azure Batch (#133)
* Started work on provider * WIP Adding batch provider * Working basic call into pool client. Need to parameterize the baseurl * Fixed job creation by manipulating the content-type * WIP Kicking off containers. Dirty * [wip] More meat around scheduling simple containers. * Working on basic task wrapper to co-schedule pods * WIP on task wrapper * WIP * Working pod minimal wrapper for batch * Integrate pod template code into provider * Cleaning up * Move to docker without gpu * WIP batch integration * partially working * Working logs * Tidy code * WIP: Testing and readme * Added readme and terraform deployment for GPU Azure Batch pool. * Update to enable low priority nodes for gpu * Fix log formatting bug. Return node logs when container not yet started * Moved to golang v1.10 * Fix cri test * Fix up minor docs Issue. Add provider to readme. Add var for vk image.
This commit is contained in:
committed by
Robbie Zhang
parent
1ad6fb434e
commit
d6e8b3daf7
146
providers/azurebatch/deployment/azurebatch/main.tf
Normal file
146
providers/azurebatch/deployment/azurebatch/main.tf
Normal file
@@ -0,0 +1,146 @@
|
||||
# Random suffix that is used as the Batch account name (see the
# "batchAccountName" template parameter and the "name" output).
resource "random_string" "batchname" {
  # Lowercase letters only: upper-case, digits and special characters are all disabled.
  length  = 8
  number  = false
  special = false
  upper   = false

  keepers = {
    # Switching to a different resource group triggers generation of a new id.
    group_name = "${var.resource_group_name}"
  }
}
|
||||
|
||||
# Deploys an Azure Batch account and one fixed-scale pool via an embedded ARM template.
#
# The pool runs a start task that downloads the bootstrap script to ./init.sh and
# executes it as an elevated pool-scoped auto user; nodes wait for it to succeed.
resource "azurerm_template_deployment" "test" {
  name                = "tfdeployment"
  resource_group_name = "${var.resource_group_name}"

  # these key-value pairs are passed into the ARM Template's `parameters` block
  # (every value is handed over as a string, matching the "string" parameter types below)
  parameters {
    "batchAccountName"      = "${random_string.batchname.result}"
    "storageAccountID"      = "${var.storage_account_id}"
    "poolBoostrapScriptUrl" = "${var.pool_bootstrap_script_url}"
    "location"              = "${var.resource_group_location}"
    "poolID"                = "${var.pool_id}"
    "vmSku"                 = "${var.vm_sku}"
    "lowPriorityNodeCount"  = "${var.low_priority_node_count}"
    "dedicatedNodeCount"    = "${var.dedicated_node_count}"
  }

  deployment_mode = "Incremental"

  # NOTE: the pool's vmSize is taken from the vmSku parameter so that var.vm_sku
  # actually controls the node size (its default remains STANDARD_NC6).
  template_body = <<DEPLOY
{
  "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
  "contentVersion": "1.0.0.0",
  "parameters": {
    "batchAccountName": {
      "type": "string",
      "metadata": {
        "description": "Batch Account Name"
      }
    },
    "poolID": {
      "type": "string",
      "metadata": {
        "description": "GPU Pool ID"
      }
    },
    "dedicatedNodeCount": {
      "type": "string"
    },
    "lowPriorityNodeCount": {
      "type": "string"
    },
    "vmSku": {
      "type": "string"
    },
    "storageAccountID": {
      "type": "string"
    },
    "poolBoostrapScriptUrl": {
      "type": "string"
    },
    "location": {
      "type": "string",
      "defaultValue": "[resourceGroup().location]",
      "metadata": {
        "description": "Location for all resources."
      }
    }
  },
  "resources": [
    {
      "type": "Microsoft.Batch/batchAccounts",
      "name": "[parameters('batchAccountName')]",
      "apiVersion": "2015-12-01",
      "location": "[parameters('location')]",
      "tags": {
        "ObjectName": "[parameters('batchAccountName')]"
      },
      "properties": {
        "autoStorage": {
          "storageAccountId": "[parameters('storageAccountID')]"
        }
      }
    },
    {
      "type": "Microsoft.Batch/batchAccounts/pools",
      "name": "[concat(parameters('batchAccountName'), '/', parameters('poolID'))]",
      "apiVersion": "2017-09-01",
      "scale": null,
      "properties": {
        "vmSize": "[parameters('vmSku')]",
        "interNodeCommunication": "Disabled",
        "maxTasksPerNode": 1,
        "taskSchedulingPolicy": {
          "nodeFillType": "Spread"
        },
        "startTask": {
          "commandLine": "/bin/bash -c ./init.sh",
          "resourceFiles": [
            {
              "blobSource": "[parameters('poolBoostrapScriptUrl')]",
              "fileMode": "777",
              "filePath": "./init.sh"
            }
          ],
          "userIdentity": {
            "autoUser": {
              "elevationLevel": "Admin",
              "scope": "Pool"
            }
          },
          "waitForSuccess": true,
          "maxTaskRetryCount": 0
        },
        "deploymentConfiguration": {
          "virtualMachineConfiguration": {
            "imageReference": {
              "publisher": "Canonical",
              "offer": "UbuntuServer",
              "sku": "16.04-LTS",
              "version": "latest"
            },
            "nodeAgentSkuId": "batch.node.ubuntu 16.04"
          }
        },
        "scaleSettings": {
          "fixedScale": {
            "targetDedicatedNodes": "[parameters('dedicatedNodeCount')]",
            "targetLowPriorityNodes": "[parameters('lowPriorityNodeCount')]",
            "resizeTimeout": "PT15M"
          }
        }
      },
      "dependsOn": [
        "[resourceId('Microsoft.Batch/batchAccounts', parameters('batchAccountName'))]"
      ]
    }
  ]
}
DEPLOY
}
|
||||
|
||||
# Exposes the generated Batch account name (the random_string result).
output "name" {
  value = "${random_string.batchname.result}"
}
|
||||
43
providers/azurebatch/deployment/azurebatch/variables.tf
Normal file
43
providers/azurebatch/deployment/azurebatch/variables.tf
Normal file
@@ -0,0 +1,43 @@
|
||||
# Pool identifier; handed to the ARM template as its "poolID" parameter.
variable "pool_id" {
  description = "Name of the Azure Batch pool to create."
  default     = "pool1"
  type        = "string"
}
|
||||
|
||||
# VM size for pool nodes; handed to the ARM template as its "vmSku" parameter.
variable "vm_sku" {
  description = "VM SKU to use - Default to NC6 GPU SKU."
  default     = "STANDARD_NC6"
  type        = "string"
}
|
||||
|
||||
# URL of the script the pool's start task downloads and runs as ./init.sh.
# Required: no default is provided.
variable "pool_bootstrap_script_url" {
  type        = "string"
  description = "Publicly accessible url used for bootstrapping nodes in the pool. Installing GPU drivers, for example."
}
|
||||
|
||||
# Storage account linked to the Batch account as auto-storage.
# The value is used verbatim as autoStorage.storageAccountId in the ARM template,
# so it must be the account's resource ID rather than its short name.
# Required: no default is provided.
variable "storage_account_id" {
  type        = "string"
  description = "Resource ID of the storage account to be used by Azure Batch"
}
|
||||
|
||||
# Resource group targeted by the template deployment; also the keeper that
# controls regeneration of the random Batch account name.
variable "resource_group_name" {
  description = "Name of the azure resource group."
  default     = "akc-rg"
  type        = "string"
}
|
||||
|
||||
# Azure region; handed to the ARM template as its "location" parameter.
variable "resource_group_location" {
  description = "Location of the azure resource group."
  default     = "eastus"
  type        = "string"
}
|
||||
|
||||
# Target count of low priority nodes for the pool's fixed scale settings.
# Required: no default is provided. Declared as a string because it is
# forwarded to a string-typed ARM template parameter.
variable "low_priority_node_count" {
  description = "The number of low priority nodes to allocate to the pool"
  type        = "string"
}
|
||||
|
||||
# Target count of dedicated nodes for the pool's fixed scale settings.
# Required: no default is provided. Declared as a string because it is
# forwarded to a string-typed ARM template parameter.
variable "dedicated_node_count" {
  type        = "string"
  description = "The number of dedicated nodes to allocate to the pool"
}
|
||||
Reference in New Issue
Block a user