Add HashiCorp Nomad provider (#483)

* provider: adding Nomad provider * updating CONTRIBUTING.md with Nomad provider * updated README.md by adding the Nomad provider * fix typo * adding nomad/api and nomad/testutil deps * adding Nomad binary dependency for provider tests * fixed the nomad binary download command step and added tolerations to the nomad provider. * adding nomad provider demo gif * adding my name to authors * adding two missing go-rootcerts files after dep ensure * delete pod comment
2019-01-08 01:18:11 +05:30
parent 5796be449b
commit a46e1dd2ce
332 changed files with 126455 additions and 2 deletions
--- a/vendor/github.com/hashicorp/raft/replication.go
+++ b/vendor/github.com/hashicorp/raft/replication.go
@@ -0,0 +1,561 @@
+package raft
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/armon/go-metrics"
+)
+
+const (
+	maxFailureScale = 12
+	failureWait     = 10 * time.Millisecond
+)
+
+var (
+	// ErrLogNotFound indicates a given log entry is not available.
+	ErrLogNotFound = errors.New("log not found")
+
+	// ErrPipelineReplicationNotSupported can be returned by the transport to
+	// signal that pipeline replication is not supported in general, and that
+	// no error message should be produced.
+	ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported")
+)
+
+// followerReplication is in charge of sending snapshots and log entries from
+// this leader during this particular term to a remote follower.
+type followerReplication struct {
+	// peer contains the network address and ID of the remote follower.
+	peer Server
+
+	// commitment tracks the entries acknowledged by followers so that the
+	// leader's commit index can advance. It is updated on successsful
+	// AppendEntries responses.
+	commitment *commitment
+
+	// stopCh is notified/closed when this leader steps down or the follower is
+	// removed from the cluster. In the follower removed case, it carries a log
+	// index; replication should be attempted with a best effort up through that
+	// index, before exiting.
+	stopCh chan uint64
+	// triggerCh is notified every time new entries are appended to the log.
+	triggerCh chan struct{}
+
+	// currentTerm is the term of this leader, to be included in AppendEntries
+	// requests.
+	currentTerm uint64
+	// nextIndex is the index of the next log entry to send to the follower,
+	// which may fall past the end of the log.
+	nextIndex uint64
+
+	// lastContact is updated to the current time whenever any response is
+	// received from the follower (successful or not). This is used to check
+	// whether the leader should step down (Raft.checkLeaderLease()).
+	lastContact time.Time
+	// lastContactLock protects 'lastContact'.
+	lastContactLock sync.RWMutex
+
+	// failures counts the number of failed RPCs since the last success, which is
+	// used to apply backoff.
+	failures uint64
+
+	// notifyCh is notified to send out a heartbeat, which is used to check that
+	// this server is still leader.
+	notifyCh chan struct{}
+	// notify is a list of futures to be resolved upon receipt of an
+	// acknowledgement, then cleared from this list.
+	notify []*verifyFuture
+	// notifyLock protects 'notify'.
+	notifyLock sync.Mutex
+
+	// stepDown is used to indicate to the leader that we
+	// should step down based on information from a follower.
+	stepDown chan struct{}
+
+	// allowPipeline is used to determine when to pipeline the AppendEntries RPCs.
+	// It is private to this replication goroutine.
+	allowPipeline bool
+}
+
+// notifyAll is used to notify all the waiting verify futures
+// if the follower believes we are still the leader.
+func (s *followerReplication) notifyAll(leader bool) {
+	// Clear the waiting notifies minimizing lock time
+	s.notifyLock.Lock()
+	n := s.notify
+	s.notify = nil
+	s.notifyLock.Unlock()
+
+	// Submit our votes
+	for _, v := range n {
+		v.vote(leader)
+	}
+}
+
+// LastContact returns the time of last contact.
+func (s *followerReplication) LastContact() time.Time {
+	s.lastContactLock.RLock()
+	last := s.lastContact
+	s.lastContactLock.RUnlock()
+	return last
+}
+
+// setLastContact sets the last contact to the current time.
+func (s *followerReplication) setLastContact() {
+	s.lastContactLock.Lock()
+	s.lastContact = time.Now()
+	s.lastContactLock.Unlock()
+}
+
+// replicate is a long running routine that replicates log entries to a single
+// follower.
+func (r *Raft) replicate(s *followerReplication) {
+	// Start an async heartbeating routing
+	stopHeartbeat := make(chan struct{})
+	defer close(stopHeartbeat)
+	r.goFunc(func() { r.heartbeat(s, stopHeartbeat) })
+
+RPC:
+	shouldStop := false
+	for !shouldStop {
+		select {
+		case maxIndex := <-s.stopCh:
+			// Make a best effort to replicate up to this index
+			if maxIndex > 0 {
+				r.replicateTo(s, maxIndex)
+			}
+			return
+		case <-s.triggerCh:
+			lastLogIdx, _ := r.getLastLog()
+			shouldStop = r.replicateTo(s, lastLogIdx)
+		case <-randomTimeout(r.conf.CommitTimeout): // TODO: what is this?
+			lastLogIdx, _ := r.getLastLog()
+			shouldStop = r.replicateTo(s, lastLogIdx)
+		}
+
+		// If things looks healthy, switch to pipeline mode
+		if !shouldStop && s.allowPipeline {
+			goto PIPELINE
+		}
+	}
+	return
+
+PIPELINE:
+	// Disable until re-enabled
+	s.allowPipeline = false
+
+	// Replicates using a pipeline for high performance. This method
+	// is not able to gracefully recover from errors, and so we fall back
+	// to standard mode on failure.
+	if err := r.pipelineReplicate(s); err != nil {
+		if err != ErrPipelineReplicationNotSupported {
+			r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err)
+		}
+	}
+	goto RPC
+}
+
+// replicateTo is a helper to replicate(), used to replicate the logs up to a
+// given last index.
+// If the follower log is behind, we take care to bring them up to date.
+func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) {
+	// Create the base request
+	var req AppendEntriesRequest
+	var resp AppendEntriesResponse
+	var start time.Time
+START:
+	// Prevent an excessive retry rate on errors
+	if s.failures > 0 {
+		select {
+		case <-time.After(backoff(failureWait, s.failures, maxFailureScale)):
+		case <-r.shutdownCh:
+		}
+	}
+
+	// Setup the request
+	if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound {
+		goto SEND_SNAP
+	} else if err != nil {
+		return
+	}
+
+	// Make the RPC call
+	start = time.Now()
+	if err := r.trans.AppendEntries(s.peer.ID, s.peer.Address, &req, &resp); err != nil {
+		r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err)
+		s.failures++
+		return
+	}
+	appendStats(string(s.peer.ID), start, float32(len(req.Entries)))
+
+	// Check for a newer term, stop running
+	if resp.Term > req.Term {
+		r.handleStaleTerm(s)
+		return true
+	}
+
+	// Update the last contact
+	s.setLastContact()
+
+	// Update s based on success
+	if resp.Success {
+		// Update our replication state
+		updateLastAppended(s, &req)
+
+		// Clear any failures, allow pipelining
+		s.failures = 0
+		s.allowPipeline = true
+	} else {
+		s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1)
+		if resp.NoRetryBackoff {
+			s.failures = 0
+		} else {
+			s.failures++
+		}
+		r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex)
+	}
+
+CHECK_MORE:
+	// Poll the stop channel here in case we are looping and have been asked
+	// to stop, or have stepped down as leader. Even for the best effort case
+	// where we are asked to replicate to a given index and then shutdown,
+	// it's better to not loop in here to send lots of entries to a straggler
+	// that's leaving the cluster anyways.
+	select {
+	case <-s.stopCh:
+		return true
+	default:
+	}
+
+	// Check if there are more logs to replicate
+	if s.nextIndex <= lastIndex {
+		goto START
+	}
+	return
+
+	// SEND_SNAP is used when we fail to get a log, usually because the follower
+	// is too far behind, and we must ship a snapshot down instead
+SEND_SNAP:
+	if stop, err := r.sendLatestSnapshot(s); stop {
+		return true
+	} else if err != nil {
+		r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err)
+		return
+	}
+
+	// Check if there is more to replicate
+	goto CHECK_MORE
+}
+
+// sendLatestSnapshot is used to send the latest snapshot we have
+// down to our follower.
+func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) {
+	// Get the snapshots
+	snapshots, err := r.snapshots.List()
+	if err != nil {
+		r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err)
+		return false, err
+	}
+
+	// Check we have at least a single snapshot
+	if len(snapshots) == 0 {
+		return false, fmt.Errorf("no snapshots found")
+	}
+
+	// Open the most recent snapshot
+	snapID := snapshots[0].ID
+	meta, snapshot, err := r.snapshots.Open(snapID)
+	if err != nil {
+		r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err)
+		return false, err
+	}
+	defer snapshot.Close()
+
+	// Setup the request
+	req := InstallSnapshotRequest{
+		RPCHeader:          r.getRPCHeader(),
+		SnapshotVersion:    meta.Version,
+		Term:               s.currentTerm,
+		Leader:             r.trans.EncodePeer(r.localID, r.localAddr),
+		LastLogIndex:       meta.Index,
+		LastLogTerm:        meta.Term,
+		Peers:              meta.Peers,
+		Size:               meta.Size,
+		Configuration:      encodeConfiguration(meta.Configuration),
+		ConfigurationIndex: meta.ConfigurationIndex,
+	}
+
+	// Make the call
+	start := time.Now()
+	var resp InstallSnapshotResponse
+	if err := r.trans.InstallSnapshot(s.peer.ID, s.peer.Address, &req, &resp, snapshot); err != nil {
+		r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err)
+		s.failures++
+		return false, err
+	}
+	metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", string(s.peer.ID)}, start)
+
+	// Check for a newer term, stop running
+	if resp.Term > req.Term {
+		r.handleStaleTerm(s)
+		return true, nil
+	}
+
+	// Update the last contact
+	s.setLastContact()
+
+	// Check for success
+	if resp.Success {
+		// Update the indexes
+		s.nextIndex = meta.Index + 1
+		s.commitment.match(s.peer.ID, meta.Index)
+
+		// Clear any failures
+		s.failures = 0
+
+		// Notify we are still leader
+		s.notifyAll(true)
+	} else {
+		s.failures++
+		r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer)
+	}
+	return false, nil
+}
+
+// heartbeat is used to periodically invoke AppendEntries on a peer
+// to ensure they don't time out. This is done async of replicate(),
+// since that routine could potentially be blocked on disk IO.
+func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) {
+	var failures uint64
+	req := AppendEntriesRequest{
+		RPCHeader: r.getRPCHeader(),
+		Term:      s.currentTerm,
+		Leader:    r.trans.EncodePeer(r.localID, r.localAddr),
+	}
+	var resp AppendEntriesResponse
+	for {
+		// Wait for the next heartbeat interval or forced notify
+		select {
+		case <-s.notifyCh:
+		case <-randomTimeout(r.conf.HeartbeatTimeout / 10):
+		case <-stopCh:
+			return
+		}
+
+		start := time.Now()
+		if err := r.trans.AppendEntries(s.peer.ID, s.peer.Address, &req, &resp); err != nil {
+			r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer.Address, err)
+			failures++
+			select {
+			case <-time.After(backoff(failureWait, failures, maxFailureScale)):
+			case <-stopCh:
+			}
+		} else {
+			s.setLastContact()
+			failures = 0
+			metrics.MeasureSince([]string{"raft", "replication", "heartbeat", string(s.peer.ID)}, start)
+			s.notifyAll(resp.Success)
+		}
+	}
+}
+
+// pipelineReplicate is used when we have synchronized our state with the follower,
+// and want to switch to a higher performance pipeline mode of replication.
+// We only pipeline AppendEntries commands, and if we ever hit an error, we fall
+// back to the standard replication which can handle more complex situations.
+func (r *Raft) pipelineReplicate(s *followerReplication) error {
+	// Create a new pipeline
+	pipeline, err := r.trans.AppendEntriesPipeline(s.peer.ID, s.peer.Address)
+	if err != nil {
+		return err
+	}
+	defer pipeline.Close()
+
+	// Log start and stop of pipeline
+	r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer)
+	defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer)
+
+	// Create a shutdown and finish channel
+	stopCh := make(chan struct{})
+	finishCh := make(chan struct{})
+
+	// Start a dedicated decoder
+	r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) })
+
+	// Start pipeline sends at the last good nextIndex
+	nextIndex := s.nextIndex
+
+	shouldStop := false
+SEND:
+	for !shouldStop {
+		select {
+		case <-finishCh:
+			break SEND
+		case maxIndex := <-s.stopCh:
+			// Make a best effort to replicate up to this index
+			if maxIndex > 0 {
+				r.pipelineSend(s, pipeline, &nextIndex, maxIndex)
+			}
+			break SEND
+		case <-s.triggerCh:
+			lastLogIdx, _ := r.getLastLog()
+			shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx)
+		case <-randomTimeout(r.conf.CommitTimeout):
+			lastLogIdx, _ := r.getLastLog()
+			shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx)
+		}
+	}
+
+	// Stop our decoder, and wait for it to finish
+	close(stopCh)
+	select {
+	case <-finishCh:
+	case <-r.shutdownCh:
+	}
+	return nil
+}
+
+// pipelineSend is used to send data over a pipeline. It is a helper to
+// pipelineReplicate.
+func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) {
+	// Create a new append request
+	req := new(AppendEntriesRequest)
+	if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil {
+		return true
+	}
+
+	// Pipeline the append entries
+	if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil {
+		r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err)
+		return true
+	}
+
+	// Increase the next send log to avoid re-sending old logs
+	if n := len(req.Entries); n > 0 {
+		last := req.Entries[n-1]
+		*nextIdx = last.Index + 1
+	}
+	return false
+}
+
+// pipelineDecode is used to decode the responses of pipelined requests.
+func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) {
+	defer close(finishCh)
+	respCh := p.Consumer()
+	for {
+		select {
+		case ready := <-respCh:
+			req, resp := ready.Request(), ready.Response()
+			appendStats(string(s.peer.ID), ready.Start(), float32(len(req.Entries)))
+
+			// Check for a newer term, stop running
+			if resp.Term > req.Term {
+				r.handleStaleTerm(s)
+				return
+			}
+
+			// Update the last contact
+			s.setLastContact()
+
+			// Abort pipeline if not successful
+			if !resp.Success {
+				return
+			}
+
+			// Update our replication state
+			updateLastAppended(s, req)
+		case <-stopCh:
+			return
+		}
+	}
+}
+
+// setupAppendEntries is used to setup an append entries request.
+func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error {
+	req.RPCHeader = r.getRPCHeader()
+	req.Term = s.currentTerm
+	req.Leader = r.trans.EncodePeer(r.localID, r.localAddr)
+	req.LeaderCommitIndex = r.getCommitIndex()
+	if err := r.setPreviousLog(req, nextIndex); err != nil {
+		return err
+	}
+	if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil {
+		return err
+	}
+	return nil
+}
+
+// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an
+// AppendEntriesRequest given the next index to replicate.
+func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error {
+	// Guard for the first index, since there is no 0 log entry
+	// Guard against the previous index being a snapshot as well
+	lastSnapIdx, lastSnapTerm := r.getLastSnapshot()
+	if nextIndex == 1 {
+		req.PrevLogEntry = 0
+		req.PrevLogTerm = 0
+
+	} else if (nextIndex - 1) == lastSnapIdx {
+		req.PrevLogEntry = lastSnapIdx
+		req.PrevLogTerm = lastSnapTerm
+
+	} else {
+		var l Log
+		if err := r.logs.GetLog(nextIndex-1, &l); err != nil {
+			r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v",
+				nextIndex-1, err)
+			return err
+		}
+
+		// Set the previous index and term (0 if nextIndex is 1)
+		req.PrevLogEntry = l.Index
+		req.PrevLogTerm = l.Term
+	}
+	return nil
+}
+
+// setNewLogs is used to setup the logs which should be appended for a request.
+func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error {
+	// Append up to MaxAppendEntries or up to the lastIndex
+	req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries)
+	maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex)
+	for i := nextIndex; i <= maxIndex; i++ {
+		oldLog := new(Log)
+		if err := r.logs.GetLog(i, oldLog); err != nil {
+			r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err)
+			return err
+		}
+		req.Entries = append(req.Entries, oldLog)
+	}
+	return nil
+}
+
+// appendStats is used to emit stats about an AppendEntries invocation.
+func appendStats(peer string, start time.Time, logs float32) {
+	metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start)
+	metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs)
+}
+
+// handleStaleTerm is used when a follower indicates that we have a stale term.
+func (r *Raft) handleStaleTerm(s *followerReplication) {
+	r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer)
+	s.notifyAll(false) // No longer leader
+	asyncNotifyCh(s.stepDown)
+}
+
+// updateLastAppended is used to update follower replication state after a
+// successful AppendEntries RPC.
+// TODO: This isn't used during InstallSnapshot, but the code there is similar.
+func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) {
+	// Mark any inflight logs as committed
+	if logs := req.Entries; len(logs) > 0 {
+		last := logs[len(logs)-1]
+		s.nextIndex = last.Index + 1
+		s.commitment.match(s.peer.ID, last.Index)
+	}
+
+	// Notify still leader
+	s.notifyAll(true)
+}