virtual-kubelet/vendor/github.com/vmware/vic/cmd/tether/attach.go
Loc Nguyen 513cebe7b7 VMware vSphere Integrated Containers provider (#206)
* Add Virtual Kubelet provider for VIC

Initial virtual kubelet provider for VMware VIC.  This provider currently
handles creating and starting a pod VM via the VIC portlayer and persona
server.  Image store handling is done via the VIC persona server.  This
provider currently requires the feature/wolfpack branch of VIC.

* Added pod stop and delete.  Also added node capacity.

Added the ability to stop and delete pod VMs via VIC.  Also retrieve
node capacity information from the VCH.

* Cleanup and readme file

Cleaned up some files and added a Readme.md markdown file for the VIC
provider.

* Cleaned up errors, added function comments, moved operation code

1. Cleaned up error handling.  Set a standard for creating errors.
2. Added method prototype comments for all interface functions.
3. Moved PodCreator, PodStarter, PodStopper, and PodDeleter to a new folder.

* Add mocking code and unit tests for podcache, podcreator, and podstarter

Used the same unit test framework as VIC to handle assertions in the provider's
unit tests.  Mocking code was generated with the OSS project mockery, which is
compatible with the testify assertion framework.

* Vendored packages for the VIC provider

Requires the feature/wolfpack branch of VIC and a few specific commit SHAs of
projects used within VIC.

* Implementation of POD Stopper and Deleter unit tests (#4)

* Updated files for initial PR
2018-06-04 15:41:32 -07:00


// Copyright 2016 VMware, Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"context"
	"errors"
	"fmt"
	"net"
	"sync"
	"sync/atomic"
	"time"

	log "github.com/Sirupsen/logrus"
	"golang.org/x/crypto/ssh"

	"github.com/vmware/vic/lib/migration/feature"
	"github.com/vmware/vic/lib/tether"
	"github.com/vmware/vic/lib/tether/msgs"
	"github.com/vmware/vic/pkg/serial"
	"github.com/vmware/vic/pkg/trace"
)

const (
	attachChannelType = "attach"
)

// server is the singleton attachServer for the tether - there can be only one
// as the backchannel line protocol may not provide multiplexing of connections
var server AttachServer
var once sync.Once

type AttachServer interface {
	tether.Extension
	start() error
	stop() error
}

// config is a struct that holds Sessions and Execs
type config struct {
	Key      []byte
	Sessions map[string]*tether.SessionConfig
	Execs    map[string]*tether.SessionConfig
}

type attachServerSSH struct {
	// serializes data access for exported functions
	m sync.Mutex

	// conn is the underlying net.Conn which carries SSH
	// held directly as it is how we stop the attach server
	conn struct {
		sync.Mutex
		conn net.Conn
	}

	// we pass serverConn to the channelMux goroutine so we need to lock it
	serverConn struct {
		sync.Mutex
		*ssh.ServerConn
	}

	// extension local copy of the bits of config important to attach
	config config

	sshConfig *ssh.ServerConfig

	enabled int32

	// Cancelable context and its cancel func. Used for resolving the deadlock
	// between run() and stop()
	ctx    context.Context
	cancel context.CancelFunc

	// INTERNAL: must be set by testAttachServer only
	testing bool
}

// NewAttachServerSSH either creates a new instance or returns the initialized one
func NewAttachServerSSH() AttachServer {
	once.Do(func() {
		// create a cancelable context and assign it to the CancelFunc
		// it is used for resolving the deadlock between run() and stop()
		// it has a Background parent as we don't want timeouts here,
		// otherwise we may start leaking goroutines in the handshake code
		ctx, cancel := context.WithCancel(context.Background())

		server = &attachServerSSH{
			ctx:    ctx,
			cancel: cancel,
		}
	})

	return server
}

// Reload - tether.Extension implementation
func (t *attachServerSSH) Reload(tconfig *tether.ExecutorConfig) error {
	defer trace.End(trace.Begin("attach reload"))

	t.m.Lock()
	defer t.m.Unlock()

	// We copy this stuff so that we're not referencing the direct config
	// structure if/while it's being updated.
	// The subelements generally have locks or are updated in a single assignment
	t.config.Key = tconfig.Key

	t.config.Sessions = make(map[string]*tether.SessionConfig)
	for k, v := range tconfig.Sessions {
		t.config.Sessions[k] = v
	}

	t.config.Execs = make(map[string]*tether.SessionConfig)
	for k, v := range tconfig.Execs {
		t.config.Execs[k] = v
	}

	err := server.start()
	if err != nil {
		detail := fmt.Sprintf("unable to start attach server: %s", err)
		log.Error(detail)
		return errors.New(detail)
	}

	return nil
}

// Enable sets enabled to true
func (t *attachServerSSH) Enable() {
	atomic.StoreInt32(&t.enabled, 1)
}

// Disable sets enabled to false
func (t *attachServerSSH) Disable() {
	atomic.StoreInt32(&t.enabled, 0)
}

// Enabled returns whether the server is enabled
func (t *attachServerSSH) Enabled() bool {
	return atomic.LoadInt32(&t.enabled) == 1
}

func (t *attachServerSSH) Start() error {
	defer trace.End(trace.Begin(""))

	return nil
}

// Stop is needed for the tether.Extension interface
func (t *attachServerSSH) Stop() error {
	defer trace.End(trace.Begin("stop attach server"))

	t.m.Lock()
	defer t.m.Unlock()

	// calling server.stop, not t.stop, so that the test impl gets invoked
	return server.stop()
}
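
// reload pushes the ids of the current exec sessions to the portlayer over the
// established ssh connection, if there is one.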
func (t *attachServerSSH) reload() error {
	t.serverConn.Lock()
	defer t.serverConn.Unlock()

	// push the exec'ed session ids to the portlayer
	if t.serverConn.ServerConn != nil {
		msg := msgs.ContainersMsg{
			IDs: t.sessions(false),
		}

		payload := msg.Marshal()
		ok, _, err := t.serverConn.SendRequest(msgs.ContainersReq, true, payload)
		if !ok || err != nil {
			return fmt.Errorf("failed to send container ids: %s, %t", err, ok)
		}
	}

	return nil
}
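
// start parses the host key, builds the ssh server configuration and kicks off
// the main run loop. If the server is already enabled it only pushes the
// updated session ids via reload.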
func (t *attachServerSSH) start() error {
	defer trace.End(trace.Begin("start attach server"))

	// if we come here while enabled, reload
	if t.Enabled() {
		log.Debugf("Start called while enabled, reloading")
		if err := t.reload(); err != nil {
			log.Warn(err)
		}

		return nil
	}

	// don't assume that the key hasn't changed
	pkey, err := ssh.ParsePrivateKey([]byte(t.config.Key))
	if err != nil {
		detail := fmt.Sprintf("failed to load key for attach: %s", err)
		log.Error(detail)
		return errors.New(detail)
	}

	// An SSH server is represented by a ServerConfig, which holds
	// certificate details and handles authentication of ServerConns.
	// TODO: update this with generated credentials for the appliance
	t.sshConfig = &ssh.ServerConfig{
		PublicKeyCallback: func(c ssh.ConnMetadata, key ssh.PublicKey) (*ssh.Permissions, error) {
			if c.User() == "daemon" {
				return &ssh.Permissions{}, nil
			}
			return nil, fmt.Errorf("expected daemon user")
		},
		PasswordCallback: func(c ssh.ConnMetadata, pass []byte) (*ssh.Permissions, error) {
			if c.User() == "daemon" {
				return &ssh.Permissions{}, nil
			}
			return nil, fmt.Errorf("expected daemon user")
		},
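		// NoClientAuth permits the ssh "none" authentication method, so the
		// callbacks above are only consulted when a client explicitly offers
		// a key or password.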
		NoClientAuth: true,
	}
	t.sshConfig.AddHostKey(pkey)

	// enable the server and start it
	t.Enable()
	go t.run()

	return nil
}

// stop is not thread safe with start
func (t *attachServerSSH) stop() error {
	defer trace.End(trace.Begin("stop attach server"))

	if t == nil {
		err := fmt.Errorf("attach server is not configured")
		log.Error(err)
		return err
	}

	if !t.Enabled() {
		err := fmt.Errorf("attach server is not enabled")
		log.Error(err)
		return err
	}

	// disable the server
	t.Disable()

	// This context is used by backchannel only. We need to cancel it before
	// trying to obtain the following lock so that backchannel interrupts the
	// underlying Read call by calling Close on it.
	// The lock is held by backchannel's caller and not released until it returns
	log.Debugf("Canceling AttachServer's context")
	t.cancel()

	t.conn.Lock()
	if t.conn.conn != nil {
		log.Debugf("Close called again on rawconn - squashing")
		// #nosec: Errors unhandled.
		t.conn.conn.Close()
		t.conn.conn = nil
	}
	t.conn.Unlock()

	return nil
}
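
// backchannel polls the raw serial connection until the handshake with the
// portlayer completes, or until the supplied context is canceled.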
func backchannel(ctx context.Context, conn net.Conn) error {
	defer trace.End(trace.Begin("establish tether backchannel"))

	// used for shutting down the goroutine cleanly; otherwise we leak a goroutine
	// for every successful return from this function
	done := make(chan struct{})

	// HACK: currently RawConn doesn't implement timeout so throttle the spinning
	// it does implement the Timeout methods so the intermediary code can be written
	// to support it, but they are stub implementations in the rawconn impl.
	// This needs to tick *faster* than the ticker in connection.go on the
	// portlayer side. The PL sends the first syn and if this isn't waiting,
	// alignment will take a few rounds (or it may never happen).
	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()

	// We run this in a separate goroutine because HandshakeServer
	// calls a Read on rawconn which is a blocking call which causes
	// the caller to block as well so this is the only way to cancel.
	// Calling Close() will unblock us and on the next tick we will
	// return ctx.Err()
	go func() {
		select {
		case <-ctx.Done():
			conn.Close()
		case <-done:
			return
		}
	}()

	for {
		select {
		case <-ticker.C:
			if ctx.Err() != nil {
				return ctx.Err()
			}

			deadline, ok := ctx.Deadline()
			if ok {
				conn.SetReadDeadline(deadline)
			}

			err := serial.HandshakeServer(conn)
			if err == nil {
				conn.SetReadDeadline(time.Time{})
				close(done)
				return nil
			}

			switch et := err.(type) {
			case *serial.HandshakeError:
				log.Debugf("HandshakeServer: %v", et)
			default:
				log.Errorf("HandshakeServer: %v", err)
			}
		}
	}
}
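
// establish (re)creates the raw serial connection and blocks until the
// backchannel handshake succeeds or the server's context is canceled.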
func (t *attachServerSSH) establish() error {
	var err error

	// we hold the t.conn.Lock during the scope of this function
	t.conn.Lock()
	defer t.conn.Unlock()

	// tests are passing their own connections so do not create connections when testing is set
	if !t.testing {
		// close the connection if required
		if t.conn.conn != nil {
			// #nosec: Errors unhandled.
			t.conn.conn.Close()
			t.conn.conn = nil
		}

		t.conn.conn, err = rawConnectionFromSerial()
		if err != nil {
			detail := fmt.Errorf("failed to create raw connection: %s", err)
			log.Error(detail)
			return detail
		}
	} else {
		// A series of unfortunate events can lead to backchannel being called with
		// a nil connection when we run unit tests.
		// https://github.com/vmware/vic/pull/5327#issuecomment-305619860
		// This check is here to handle that
		if t.conn.conn == nil {
			return fmt.Errorf("nil connection")
		}
	}

	// wait for backchannel to establish
	err = backchannel(t.ctx, t.conn.conn)
	if err != nil {
		detail := fmt.Errorf("failed to establish backchannel: %s", err)
		log.Error(detail)
		return detail
	}

	return nil
}
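
// cleanup closes the underlying ssh server connection, if one is present.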
func (t *attachServerSSH) cleanup() {
	t.serverConn.Lock()
	defer t.serverConn.Unlock()

	log.Debugf("cleanup on connection")

	if t.serverConn.ServerConn != nil {
		log.Debugf("closing underlying connection")
		t.serverConn.Close()
		t.serverConn.ServerConn = nil
	}
}

// run should not be called directly, but via start
// run will establish an ssh server listening on the backchannel
func (t *attachServerSSH) run() error {
	defer trace.End(trace.Begin("main attach server loop"))

	var established bool
	var chans <-chan ssh.NewChannel
	var reqs <-chan *ssh.Request
	var err error

	// main run loop
	for t.Enabled() {
		t.serverConn.Lock()
		established = t.serverConn.ServerConn != nil
		t.serverConn.Unlock()

		// keep waiting for the connection to establish
		for !established && t.Enabled() {
			log.Infof("Trying to establish a connection")

			if err := t.establish(); err != nil {
				log.Error(err)
				continue
			}

			// create the SSH server using underlying t.conn
			t.serverConn.Lock()
			t.serverConn.ServerConn, chans, reqs, err = ssh.NewServerConn(t.conn.conn, t.sshConfig)
			if err != nil {
				detail := fmt.Errorf("failed to establish ssh handshake: %s", err)
				log.Error(detail)
			}

			established = t.serverConn.ServerConn != nil
			t.serverConn.Unlock()
		}

		// Global requests
		go t.globalMux(reqs, t.cleanup)

		log.Infof("Ready to service attach requests")

		// Service the incoming channels
		for attachchan := range chans {
			// The only channel type we'll support is attach
			if attachchan.ChannelType() != attachChannelType {
				detail := fmt.Sprintf("unknown channel type %s", attachchan.ChannelType())
				attachchan.Reject(ssh.UnknownChannelType, detail)
				log.Error(detail)
				continue
			}

			// check we have a Session matching the requested ID
			bytes := attachchan.ExtraData()
			if bytes == nil {
				detail := "attach channel requires ID in ExtraData"
				attachchan.Reject(ssh.Prohibited, detail)
				log.Error(detail)
				continue
			}

			sessionid := string(bytes)
			s, oks := t.config.Sessions[sessionid]
			e, oke := t.config.Execs[sessionid]
			if !oks && !oke {
				detail := fmt.Sprintf("session %s is invalid", sessionid)
				attachchan.Reject(ssh.Prohibited, detail)
				log.Error(detail)
				continue
			}

			// we have sessionid
			session := s
			if oke {
				session = e
			}

			// session is potentially blocked in launch until we've got the unblock message, so we cannot lock it.
			// check that session is valid
			// The detail remains concise as it'll eventually make its way to the user
			if session.Started != "" && session.Started != "true" {
				detail := fmt.Sprintf("launch failed with: %s", session.Started)
				attachchan.Reject(ssh.Prohibited, detail)
				log.Error(detail)
				continue
			}

			if session.StopTime != 0 {
				detail := fmt.Sprintf("process finished with exit code: %d", session.ExitStatus)
				attachchan.Reject(ssh.Prohibited, detail)
				log.Error(detail)
				continue
			}
			channel, requests, err := attachchan.Accept()
			if err != nil {
				detail := fmt.Sprintf("could not accept channel: %s", err)
				log.Errorf(detail)
				continue
			}

			// bind the channel to the Session
			log.Debugf("binding reader/writers for channel for %s", sessionid)

			log.Debugf("Adding [%p] to Outwriter", channel)
			session.Outwriter.Add(channel)
			log.Debugf("Adding [%p] to Reader", channel)
			session.Reader.Add(channel)

			// cleanup on detach from the session
			cleanup := func() {
				log.Debugf("Cleanup on detach from the session")

				log.Debugf("Removing [%p] from Outwriter", channel)
				session.Outwriter.Remove(channel)
				log.Debugf("Removing [%p] from Reader", channel)
				session.Reader.Remove(channel)

				channel.Close()
			}
			detach := cleanup

			// ttys merge stdout and stderr, so we don't bind an additional reader in
			// that case, but we need to do so for non-tty sessions
			if !session.Tty {
				// persist the value as we end up with different values each time we access it
				stderr := channel.Stderr()
				log.Debugf("Adding [%p] to Errwriter", stderr)
				session.Errwriter.Add(stderr)

				detach = func() {
					log.Debugf("Cleanup on detach from the session (non-tty)")

					log.Debugf("Removing [%p] from Errwriter", stderr)
					session.Errwriter.Remove(stderr)

					cleanup()
				}
			}
			log.Debugf("reader/writers bound for channel for %s", sessionid)

			go t.channelMux(requests, session, detach)
		}

		log.Info("Incoming attach channel closed")
	}

	return nil
}
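
// sessions returns the ids of the active, unstopped sessions. Exec sessions are
// always included (skipping any that failed to launch); primary container
// sessions are included only when all is true.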
func (t *attachServerSSH) sessions(all bool) []string {
	defer trace.End(trace.Begin(""))

	var keys []string

	// this iterates the local copies of the sessions maps
	// so we don't need to care whether they're initialized or not
	// as extension reload comes after that point

	// whether to include primary container sessions or not
	if all {
		for k, v := range t.config.Sessions {
			if v.Active && v.StopTime == 0 {
				keys = append(keys, k)
			}
		}
	}

	for k, v := range t.config.Execs {
		// skip those that have had launch errors
		if v.Active && v.StopTime == 0 && (v.Started == "" || v.Started == "true") {
			keys = append(keys, k)
		}
	}

	log.Debugf("Returning %d keys", len(keys))
	return keys
}
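
// globalMux services global (connection-level) ssh requests from the portlayer,
// currently the active container list and the migration version query.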
func (t *attachServerSSH) globalMux(in <-chan *ssh.Request, cleanup func()) {
	defer trace.End(trace.Begin("attach server global request handler"))

	// cleanup function passed by the caller
	defer cleanup()

	// for the actions after we process the request
	var pendingFn func()
	for req := range in {
		var payload []byte
		ok := true

		log.Infof("received global request type %v", req.Type)

		switch req.Type {
		case msgs.ContainersReq:
			msg := msgs.ContainersMsg{
				IDs: t.sessions(true),
			}
			payload = msg.Marshal()
		case msgs.VersionReq:
			msg := msgs.VersionMsg{
				Version: feature.MaxPluginVersion - 1,
			}
			payload = msg.Marshal()
		default:
			ok = false
			payload = []byte("unknown global request type: " + req.Type)
		}

		log.Debugf("Returning payload: %s", string(payload))

		// make sure that errors get sent back if we failed
		if req.WantReply {
			log.Debugf("Sending global request reply %t back with %#v", ok, payload)
			if err := req.Reply(ok, payload); err != nil {
				log.Warnf("Failed to reply to a global request")
			}
		}

		// run any pending work now that a reply has been sent
		if pendingFn != nil {
			log.Debug("Invoking pending work for global mux")
			go pendingFn()
			pendingFn = nil
		}
	}
}
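
// channelMux services per-channel ssh requests for a single session: ping,
// unblock-launch, window change and stdin close.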
func (t *attachServerSSH) channelMux(in <-chan *ssh.Request, session *tether.SessionConfig, cleanup func()) {
	defer trace.End(trace.Begin("attach server channel request handler"))

	// cleanup function passed by the caller
	defer cleanup()

	// make sure one-shot actions, such as unblocking the launch, only happen once
	var once sync.Once

	// for the actions after we process the request
	var pendingFn func()
	for req := range in {
		ok := true
		abort := false

		log.Infof("received channel mux type %v", req.Type)

		switch req.Type {
		case msgs.PingReq:
			log.Infof("Received PingReq for %s", session.ID)
			if string(req.Payload) != msgs.PingMsg {
				log.Infof("Received corrupted PingReq for %s", session.ID)
				ok = false
			}
		case msgs.UnblockReq:
			log.Infof("Received UnblockReq for %s", session.ID)
			if string(req.Payload) != msgs.UnblockMsg {
				log.Infof("Received corrupted UnblockReq for %s", session.ID)
				ok = false
				break
			}

			// if the process has exited, or couldn't launch
			if session.Started != "" && session.Started != "true" {
				// we need to force the session closed so that error handling occurs
				// on the caller's side
				ok = false
				abort = true
			} else {
				// unblock ^ (above)
				pendingFn = func() {
					once.Do(func() {
						launchChan := session.ClearToLaunch
						if session.RunBlock && launchChan != nil && session.Started == "" {
							log.Infof("Unblocking the launch of %s", session.Common.ID)

							// make sure that portlayer received the container id back
							launchChan <- struct{}{}

							log.Infof("Unblocked the launch of %s", session.Common.ID)
						}
					})
				}
			}
		case msgs.WindowChangeReq:
			session.Lock()
			pty := session.Pty
			session.Unlock()

			msg := msgs.WindowChangeMsg{}
			if pty == nil {
				ok = false
				log.Errorf("illegal window-change request for non-tty")
			} else if err := msg.Unmarshal(req.Payload); err != nil {
				ok = false
				log.Errorf(err.Error())
			} else if err := resizePty(pty.Fd(), &msg); err != nil {
				ok = false
				log.Errorf(err.Error())
			}
		case msgs.CloseStdinReq:
			log.Infof("Received CloseStdinReq for %s", session.ID)

			log.Debugf("Configuring reader to propagate EOF for %s", session.ID)
			session.Reader.PropagateEOF(true)
		default:
			ok = false
			log.Error(fmt.Sprintf("ssh request type %s is not supported", req.Type))
		}

		// payload is ignored on channel specific replies. The ok is passed, however.
		if req.WantReply {
			log.Debugf("Sending channel request reply %t back", ok)
			if err := req.Reply(ok, nil); err != nil {
				log.Warnf("Failed replying to a channel request: %s", err)
			}
		}

		// run any pending work now that a reply has been sent
		if pendingFn != nil {
			log.Debug("Invoking pending work for channel mux")
			go pendingFn()
			pendingFn = nil
		}

		if abort {
			break
		}
	}
}

// winsize mirrors the kernel's struct winsize, which is used with the
// TIOCSWINSZ ioctl when resizing a pty (see resizePty)
type winsize struct {
	wsRow    uint16
	wsCol    uint16
	wsXpixel uint16
	wsYpixel uint16
}