virtual-kubelet/providers/aws/provider_test.go

package aws_test

import (
	"context"
	"fmt"
	"io/ioutil"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/cloudwatchlogs"
	"github.com/aws/aws-sdk-go/service/ec2"
	"github.com/aws/aws-sdk-go/service/ecs"
	"github.com/aws/aws-sdk-go/service/iam"
	vkAWS "github.com/virtual-kubelet/virtual-kubelet/providers/aws"
	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
)

const (
	// E2E test configuration.
	testName          = "vk-fargate-e2e-test"
	defaultTestRegion = "us-east-1"

	// Environment variables that modify the test behavior.
	envSkipTests  = "SKIP_AWS_E2E"
	envTestRegion = "VK_TEST_FARGATE_REGION"
)

// executorRoleAssumePolicy is the policy used by task execution role.
const executorRoleAssumePolicy = `{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "",
			"Effect": "Allow",
      "Principal": {
        "Service": "ecs-tasks.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}`

// testConfig contains the Fargate provider test configuration template.
const testConfig = `
Region = "%s"
ClusterName = "%s"
Subnets = [ "%s" ]
SecurityGroups = [ ]
AssignPublicIPv4Address = true
ExecutionRoleArn = "%s"
CloudWatchLogGroupName = "%s"
`

var (
	ecsClient        *ecs.ECS
	testRegion       string
	subnetID         *string
	executorRoleName *string
	logGroupName     *string
)

// TestMain wraps the tests with the extra setup and teardown of AWS resources.
func TestMain(m *testing.M) {
	var err error

	// Skip the tests in this package if the environment variable is set.
	if os.Getenv(envSkipTests) == "1" {
		fmt.Println("Skipping AWS E2E tests.")
		os.Exit(0)
	}

	// Query the test region.
	region, ok := os.LookupEnv(envTestRegion)
	if ok {
		testRegion = region
	} else {
		testRegion = defaultTestRegion
	}
	fmt.Printf("Starting provider tests in region %s\n", testRegion)

	// Create the session and clients.
	session := session.New(&aws.Config{
		Region: aws.String(testRegion),
	})

	ecsClient = ecs.New(session)
	ec2Client := ec2.New(session)
	cloudwatchClient := cloudwatchlogs.New(session)
	iamClient := iam.New(session)

	// Create a test VPC with one subnet and internet access.
	// Internet access is required to pull public images from the docker registry.
	subnetID, err = createVpcWithInternetAccess(ec2Client)
	if err != nil {
		fmt.Printf("Failed to create VPC: %+v\n", err)
		os.Exit(-1)
	}

	// Create the AWS CloudWatch Logs log group used by containers.
	logGroupName = aws.String("/ecs/" + testName)
	_, err = cloudwatchClient.CreateLogGroup(&cloudwatchlogs.CreateLogGroupInput{
		LogGroupName: logGroupName,
	})
	if err != nil {
		fmt.Printf("Failed to create CloudWatch Logs log group: %+v\n", err)
		os.Exit(-1)
	}

	// Create the role used by Fargate to write logs and pull ECR images.
	executorRoleName = aws.String(testName)
	_, err = iamClient.CreateRole(&iam.CreateRoleInput{
		RoleName:                 executorRoleName,
		AssumeRolePolicyDocument: aws.String(executorRoleAssumePolicy),
	})
	if err != nil {
		fmt.Printf("Failed to create task execution role: %+v", err)
		os.Exit(-1)
	}

	// Attach the default policy allowing log writes and ECR pulls.
	iamClient.AttachRolePolicy(&iam.AttachRolePolicyInput{
		PolicyArn: aws.String("arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"),
		RoleName:  executorRoleName,
	})
	if err != nil {
		fmt.Printf("Failed to attach role policy: %+v", err)
		os.Exit(-1)
	}

	// Run the tests.
	exitCode := m.Run()

	// Delete the task execution role.
	iamClient.DetachRolePolicy(&iam.DetachRolePolicyInput{
		PolicyArn: aws.String("arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"),
		RoleName:  executorRoleName,
	})
	if err != nil {
		fmt.Printf("Failed to delete task execution role: %+v", err)
	}

	// Delete the role.
	_, err = iamClient.DeleteRole(&iam.DeleteRoleInput{
		RoleName: executorRoleName,
	})
	if err != nil {
		fmt.Printf("Failed to delete task execution role: %+v", err)
	}

	// Delete the log group.
	_, err = cloudwatchClient.DeleteLogGroup(&cloudwatchlogs.DeleteLogGroupInput{
		LogGroupName: logGroupName,
	})
	if err != nil {
		fmt.Printf("Failed to delete CloudWatch Logs log group: %+v\n", err)
	}

	// Delete the test VPC.
	err = deleteVpc(ec2Client)
	if err != nil {
		fmt.Printf("Failed to delete VPC: %+v\n", err)
	}

	os.Exit(exitCode)
}

// TestAWSFargateProviderPodLifecycle validates basic pod lifecycle by starting and stopping a pod.
func TestAWSFargateProviderPodLifecycle(t *testing.T) {
	// Create a cluster for the E2E test.
	createResponse, err := ecsClient.CreateCluster(&ecs.CreateClusterInput{
		ClusterName: aws.String(testName),
	})
	if err != nil {
		t.Error(err)
	}
	clusterID := createResponse.Cluster.ClusterArn

	time.Sleep(10 * time.Second)

	t.Run("Create, list and delete pod", func(t *testing.T) {
		// Write provider config file with test configuration.
		config := fmt.Sprintf(testConfig, testRegion, testName, *subnetID, *executorRoleName, *logGroupName)
		fmt.Printf("Fargate provider test configuration:%s", config)

		tmpfile, err := ioutil.TempFile("", "example")
		if err != nil {
			t.Fatal(err)
		}

		defer os.Remove(tmpfile.Name())

		if _, err = tmpfile.Write([]byte(config)); err != nil {
			t.Fatal(err)
		}

		if err = tmpfile.Close(); err != nil {
			t.Fatal(err)
		}

		// Start the Fargate provider.
		provider, err := vkAWS.NewFargateProvider(
			tmpfile.Name(), nil, testName, "Linux", "1.2.3.4", 10250)
		if err != nil {
			t.Fatal(err)
		}

		// Confirm that there are no pods on the cluster.
		pods, err := provider.GetPods(context.Background())
		if err != nil {
			t.Error(err)
		}
		if len(pods) != 0 {
			t.Errorf("Expect zero pods, but received %d pods\n%v", len(pods), pods)
		}

		// Create a test pod.
		podName := fmt.Sprintf("test_%d", time.Now().UnixNano()/1000)

		pod := &v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name:      podName,
				Namespace: "default",
				UID:       types.UID("unique"),
			},
			Spec: v1.PodSpec{
				Containers: []v1.Container{v1.Container{
					Name:  "echo-container",
					Image: "busybox",
					Command: []string{
						"/bin/sh",
					},
					Args: []string{
						"-c",
						"echo \"Started\";" +
							"echo \"TEST_ENV=$TEST_ENV\";" +
							"while true; do sleep 1; done",
					},
					Env: []v1.EnvVar{
						{Name: "TEST_ENV", Value: "AnyValue"},
					},
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							v1.ResourceCPU:    resource.MustParse("200m"),
							v1.ResourceMemory: resource.MustParse("450Mi"),
						},
						Requests: v1.ResourceList{
							v1.ResourceCPU:    resource.MustParse("100m"),
							v1.ResourceMemory: resource.MustParse("256Mi"),
						},
					},
				}},
			},
		}

		err = provider.CreatePod(context.Background(), pod)
		if err != nil {
			t.Fatal(err)
		}

		// Now there should be exactly one pod.
		pods, err = provider.GetPods(context.Background())
		if err != nil {
			t.Error(err)
		}
		if len(pods) != 1 {
			t.Errorf("Expect one pods, but received %d pods\n%v", len(pods), pods)
		}

		// Wait until the pod is running.
		err = waitUntilPodStatus(provider, podName, v1.PodRunning)
		if err != nil {
			t.Error(err)
		}

		// Wait a few seconds for the logs to settle.
		time.Sleep(10 * time.Second)

		logs, err := provider.GetContainerLogs(context.Background(), "default", podName, "echo-container", 100)
		if err != nil {
			t.Error(err)
		}

		// Test log output.
		receivedLogs := strings.Split(logs, "\n")
		expectedLogs := []string{
			"Started",
			pod.Spec.Containers[0].Env[0].Name + "=" + pod.Spec.Containers[0].Env[0].Value,
		}

		for i, line := range receivedLogs {
			fmt.Printf("Log[#%d]: %v\n", i, line)
			if len(expectedLogs) > i && receivedLogs[i] != expectedLogs[i] {
				t.Errorf("Expected log line %d to be %q, but received %q", i, line, receivedLogs[i])
			}
		}

		// Delete the pod.
		err = provider.DeletePod(context.Background(), pod)
		if err != nil {
			t.Fatal(err)
		}

		err = waitUntilPodStatus(provider, podName, v1.PodSucceeded)
		if err != nil {
			t.Error(err)
		}

		// The cluster should be empty again.
		pods, err = provider.GetPods(context.Background())
		if err != nil {
			t.Error(err)
		}
		if len(pods) != 0 {
			t.Errorf("Expect zero pods, but received %d pods\n%v", len(pods), pods)
		}
	})

	// Delete the test cluster.
	_, err = ecsClient.DeleteCluster(&ecs.DeleteClusterInput{
		Cluster: clusterID,
	})
	if err != nil {
		t.Error(err)
	}
}

// waitUntilPodStatus polls pod status until the desired state is reached.
func waitUntilPodStatus(provider *vkAWS.FargateProvider, podName string, desiredStatus v1.PodPhase) error {
	ctx := context.Background()
	context.WithTimeout(ctx, time.Duration(time.Second*60))
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			status, err := provider.GetPodStatus(context.Background(), "default", podName)
			if err != nil {
				if strings.Contains(err.Error(), "is not found") {
					return nil
				}

				return err
			}
			if status.Phase == desiredStatus {
				return nil
			}

			time.Sleep(3 * time.Second)
		}
	}
}