Create a provider to use Azure Batch (#133)

* Started work on provider

* WIP Adding batch provider

* Working basic call into pool client. Need to parameterize the baseurl

* Fixed job creation by manipulating the content-type

* WIP Kicking off containers. Dirty

* [wip] More meat around scheduling simple containers.

* Working on basic task wrapper to co-schedule pods

* WIP on task wrapper

* WIP

* Working pod minimal wrapper for batch

* Integrate pod template code into provider

* Cleaning up

* Move to docker without gpu

* WIP batch integration

* partially working

* Working logs

* Tidy code

* WIP: Testing and readme

* Added readme and terraform deployment for GPU Azure Batch pool.

* Update to enable low priority nodes for gpu

* Fix log formatting bug. Return node logs when container not yet started

* Moved to golang v1.10

* Fix cri test

* Fix up minor docs issue. Add provider to README. Add variable for the virtual-kubelet image.
This commit is contained in:
Lawrence Gripper
2018-06-23 00:33:49 +01:00
committed by Robbie Zhang
parent 1ad6fb434e
commit d6e8b3daf7
75 changed files with 20040 additions and 6 deletions

21
vendor/github.com/lawrencegripper/pod2docker/LICENSE generated vendored Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2018 Lawrence Gripper
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,105 @@
package pod2docker
import (
"bytes"
"strings"
"text/template"
"k8s.io/api/core/v1"
)
// ImageRegistryCredential - Used to input a credential used by docker login
// so the generated script can pull images from a private registry.
// NOTE(review): the password is embedded in the generated bash script in
// plain text (see the docker login line in the template) — treat the
// script output as a secret.
type ImageRegistryCredential struct {
// Server is the registry host passed as the final argument to "docker login".
Server string `json:"server,omitempty"`
// Username for the registry account (docker login -u).
Username string `json:"username,omitempty"`
// Password for the registry account (docker login -p).
Password string `json:"password,omitempty"`
}
// PodComponents provides details to run a pod. It is the data passed to
// the bash-script template rendered by GetBashCommand.
type PodComponents struct {
// PullCredentials are registry logins performed before any image pull.
PullCredentials []ImageRegistryCredential
// InitContainers run (in order) before Containers, mirroring Kubernetes
// init-container semantics.
InitContainers []v1.Container
// Containers are the pod's main containers, run detached and co-scheduled
// in a shared network/IPC namespace.
Containers []v1.Container
// Volumes available to the containers; only EmptyDir and HostPath sources
// are honored by the template (see getValidVolumeMounts).
Volumes []v1.Volume
// PodName names the docker network, the pause container and prefixes
// created volumes.
PodName string
}
// GetBashCommand generates the bash script to execute the pod. The script
// emulates Kubernetes pod semantics with plain docker: a pause container
// provides the shared network/IPC namespaces, init containers run first,
// then the main containers run detached with logs collected on exit.
// It returns the rendered script, or an error if the embedded template
// fails to parse or execute.
func GetBashCommand(p PodComponents) (string, error) {
	// Named "tmpl" to avoid shadowing the imported text/template package
	// (the previous local variable was called "template").
	tmpl, err := template.New("run.sh.tmpl").Option("missingkey=error").Funcs(template.FuncMap{
		"getLaunchCommand":     getLaunchCommand,
		"isHostPathVolume":     isHostPathVolume,
		"isEmptyDirVolume":     isEmptyDirVolume,
		"isPullAlways":         isPullAlways,
		"getValidVolumeMounts": getValidVolumeMounts,
		"isNvidiaRuntime":      isNvidiaRuntime,
	}).Parse(azureBatchPodTemplate)
	if err != nil {
		return "", err
	}
	var output bytes.Buffer
	err = tmpl.Execute(&output, p)
	return output.String(), err
}
// getLaunchCommand builds the command-line string appended to "docker run":
// the container's Command words joined by spaces, then its Args. A separating
// space is written whenever Command was non-empty (including the trailing
// space when Args is empty, matching the original behavior exactly).
func getLaunchCommand(container v1.Container) (cmd string) {
	var b strings.Builder
	if len(container.Command) > 0 {
		b.WriteString(strings.Join(container.Command, " "))
	}
	if b.Len() > 0 {
		b.WriteByte(' ')
	}
	if len(container.Args) > 0 {
		b.WriteString(strings.Join(container.Args, " "))
	}
	return b.String()
}
// isNvidiaRuntime reports whether the container requests a GPU via the
// "nvidia.com/gpu" resource limit; the template then adds "--runtime nvidia"
// to the docker run invocation.
func isNvidiaRuntime(c v1.Container) bool {
	_, requestsGPU := c.Resources.Limits["nvidia.com/gpu"]
	return requestsGPU
}
// isHostPathVolume reports whether the volume is backed by a HostPath source,
// which the template materializes as a bind-mounted docker volume.
func isHostPathVolume(v v1.Volume) bool {
	return v.HostPath != nil
}
// isEmptyDirVolume reports whether the volume is backed by an EmptyDir source,
// which the template materializes as a plain docker volume.
func isEmptyDirVolume(v v1.Volume) bool {
	return v.EmptyDir != nil
}
// isPullAlways reports whether the container's image pull policy is
// PullAlways; if so the template emits an explicit "docker pull" before
// "docker run".
func isPullAlways(c v1.Container) bool {
	return c.ImagePullPolicy == v1.PullAlways
}
// getValidVolumeMounts returns the subset of the container's volume mounts
// whose named volume exists in the pod spec AND is backed by an EmptyDir or
// HostPath source — the only two volume types the generated script creates.
// Mounts referencing unknown or unsupported volumes are silently dropped.
func getValidVolumeMounts(container v1.Container, volumes []v1.Volume) []v1.VolumeMount {
	// Map each volume name to whether the script can actually back it.
	supported := make(map[string]bool, len(volumes))
	for _, vol := range volumes {
		supported[vol.Name] = vol.EmptyDir != nil || vol.HostPath != nil
	}
	var mounts []v1.VolumeMount
	for _, mount := range container.VolumeMounts {
		if supported[mount.Name] {
			mounts = append(mounts, mount)
		}
	}
	return mounts
}

View File

@@ -0,0 +1,167 @@
package pod2docker
// Todo: Investigate a better way to inline this template - especially when escaping the backticks.
// Consider: https://mattjibson.com/blog/2014/11/19/esc-embedding-static-assets/
// azureBatchPodTemplate is the bash script rendered by GetBashCommand.
// It emulates a Kubernetes pod with plain docker: a pause container holds
// the shared network/IPC namespaces, init containers run to completion
// first, main containers run detached; the script waits for any container
// to exit, then the EXIT trap collects logs and removes containers,
// network and volumes.
// FIX(review): docker's -e flag takes NAME=VALUE; the previous template
// used NAME:VALUE, which does not set the variable inside the container.
const azureBatchPodTemplate = `
#!/bin/bash
set -eE
trap cleanup EXIT
if ! type 'docker' > /dev/null; then
echo 'Docker not installed... exiting'
exit 1
fi
{{/* Vars */}}
{{$podName := .PodName}}
{{$volumes := .Volumes}}
{{/* Login to required image repositories */}}
{{range .PullCredentials }}
docker login -u {{.Username}} -p {{.Password}} {{.Server}}
{{end}}
function cleanup(){
{{/* Take a copy of the container log, which is removed when the container is deleted */}}
echo 'Pod Exited: Copying logs'
{{range $index, $container := .InitContainers}}
if [[ -f ./initcontainer-{{$index}}.cid ]]; then
container_{{$index}}_ID=$(<./initcontainer-{{$index}}.cid)
container_{{$index}}_Log_Path=$(docker inspect --format='{{"{{.LogPath}}"}}' $container_{{$index}}_ID)
cp $container_{{$index}}_Log_Path ./{{$container.Name}}.log
docker rm -f $container_{{$index}}_ID
rm -f ./initcontainer-{{$index}}.cid
fi
{{end}}
{{range $index, $container := .Containers}}
if [[ -f ./{{$container.Name}}.log && -f ./container-{{$index}}.cid ]]; then
container_{{$index}}_ID=$(<./container-{{$index}}.cid)
container_{{$index}}_Log_Path=$(docker inspect --format='{{"{{.LogPath}}"}}' $container_{{$index}}_ID)
rm ./{{$container.Name}}.log {{/* Remove the existing symlink */}}
cp $container_{{$index}}_Log_Path ./{{$container.Name}}.log
fi
{{end}}
{{/* Remove the containers, network and volumes */}}
echo 'Pod Exited: Removing all containers'
if ls container-* 1> /dev/null 2>&1; then
for line in ` + "`ls container-*`" + `
do
id=$(cat $line)
echo '-Logs container..'
docker logs $id
echo '-Removing container..'
docker rm -f $id
rm -f $line
done
fi
echo '-Removing pause container..'
docker rm -f {{$podName}} || echo 'Remove pause container failed'
rm -f ./pauseid.cid
echo '-Removing network container..'
docker network rm {{$podName}} || echo 'Remove network failed'
echo '-Removing volumes..'
{{range .Volumes}}
docker volume rm -f {{$podName}}_{{.Name}} || echo 'Remove volume failed'
{{end}}
}
{{/* Create Pod network and start it */}}
docker network create {{$podName}}
docker run -d --network {{$podName}} --name {{$podName}} --cidfile="./pauseid.cid" gcr.io/google_containers/pause:1.0
{{/* Handle volumes */}}
{{range .Volumes}}
{{if isHostPathVolume .}}
docker volume create --name {{$podName}}_{{.Name}} --opt type=none --opt device={{.VolumeSource.HostPath.Path}} --opt o=bind
{{end}}
{{if isEmptyDirVolume .}}
docker volume create {{$podName}}_{{.Name}}
{{end}}
{{end}}
{{/* Run the init containers in the Pod. Attaching to shared namespace */}}
{{range $index, $container := .InitContainers}}
echo 'Running init container {{$index}}..'
{{if isPullAlways .}}
docker pull {{$container.Image}}
{{end}}
docker run --network container:{{$podName}} --ipc container:{{$podName}} \
{{- if isNvidiaRuntime $container}}
--runtime nvidia \
{{- end}}
{{- range $index, $envs := $container.Env}}
-e "{{$envs.Name}}={{$envs.Value}}" \
{{- end}}
{{- range $index, $mount := getValidVolumeMounts $container $volumes}}
-v {{$podName}}_{{$mount.Name}}:{{$mount.MountPath}} \
{{- end}}
--cidfile=./initcontainer-{{$index}}.cid {{$container.Image}} {{getLaunchCommand $container}}
{{end}}
{{/* Run the containers in the Pod. Attaching to shared namespace */}}
{{range $index, $container := .Containers}}
{{if isPullAlways .}}
docker pull {{$container.Image}}
{{end}}
docker run -d --network container:{{$podName}} --ipc container:{{$podName}} \
{{- if isNvidiaRuntime $container}}
--runtime nvidia \
{{- end}}
{{- range $index, $envs := $container.Env}}
-e "{{$envs.Name}}={{$envs.Value}}" \
{{- end}}
{{- range $index, $mount := getValidVolumeMounts $container $volumes}}
-v {{$podName}}_{{$mount.Name}}:{{$mount.MountPath}} \
{{- end}}
--cidfile=./container-{{$index}}.cid {{$container.Image}} {{getLaunchCommand $container}}
{{end}}
{{/* Symlink all container logs files to task directory */}}
{{range $index, $container := .Containers}}
container_{{$index}}_ID=$(<./container-{{$index}}.cid)
container_{{$index}}_Log_Path=$(docker inspect --format='{{"{{.LogPath}}"}}' $container_{{$index}}_ID)
ln -f -s $container_{{$index}}_Log_Path ./{{$container.Name}}.log
{{end}}
echo 'Running Pod: {{.PodName}}'
{{/* Wait until any of these containers stop */}}
echo 'Waiting for any of the containers to exit'
for line in ` + "`ls container-*`" + `
do
id=$(cat $line)
docker wait $id &
done
while [ $(jobs -p | wc -l) == {{.Containers | len}} ]
do
sleep 2
done
{{/* Get exit codes from containers */}}
echo 'Checking container exit codes'
overallExitCode=0
for line in ` + "`ls container-*`" + `
do
id=$(cat $line)
echo 'Getting exitcode'
exitCode=$(docker inspect -f {{"{{.State.ExitCode}}"}} $id)
echo 'ID: ' $id ' ExitCode: ' $exitCode
echo 'Checking exitcode'
if (($exitCode != 0))
then
echo 'Assigning exitcode'
overallExitCode=$exitCode
fi
done
exit $overallExitCode
`