Add updating cached allocations

2021-05-31 16:34:28 +02:00
parent 66821dbfc8
commit 3f572261c2
8 changed files with 213 additions and 15 deletions
--- a/nomad/nomad.go
+++ b/nomad/nomad.go
@@ -9,6 +9,7 @@ import (
 	"gitlab.hpi.de/codeocean/codemoon/poseidon/logging"
 	"net/url"
 	"strings"
+	"time"
 )

 var (
@@ -16,6 +17,8 @@ var (
 	ErrorExecutorCommunicationFailed = errors.New("communication with executor failed")
 )

+type allocationProcessor func(*nomadApi.Allocation) // callback that receives the allocation an event refers to
+
 // ExecutorApi provides access to an container orchestration solution
 type ExecutorApi interface {
 	apiQuerier
@@ -28,6 +31,10 @@ type ExecutorApi interface {
 	// If the evaluation was not successful, an error containing the failures is returned.
 	// See also https://github.com/hashicorp/nomad/blob/7d5a9ecde95c18da94c9b6ace2565afbfdd6a40d/command/monitor.go#L175
 	MonitorEvaluation(evalID string, ctx context.Context) error
+
+	// WatchAllocations listens on the Nomad event stream for allocation events.
+	// Depending on the incoming event, one of the given functions is executed.
+	WatchAllocations(ctx context.Context, onNewAllocation, onDeletedAllocation allocationProcessor) error
 }

 // ApiClient implements the ExecutorApi interface and can be used to perform different operations on the real Executor API and its return values.
@@ -74,26 +81,100 @@ func (a *ApiClient) MonitorEvaluation(evalID string, ctx context.Context) error
 		return err
 	}
 	// If ctx is cancelled, the stream will be closed by Nomad and we exit the for loop.
+	return receiveAndHandleNomadAPIEvents(stream, handleEvaluationEvent)
+}
+
+// WatchAllocations opens the allocation event stream and passes every received event to handleAllocationEvent,
+// which calls onNewAllocation or onDeletedAllocation. Stream and handler errors are returned to the caller.
+func (a *ApiClient) WatchAllocations(ctx context.Context, onNewAllocation, onDeletedAllocation allocationProcessor) error {
+	startTime := time.Now().UnixNano()
+	allocStream, err := a.AllocationStream(ctx)
+	if err != nil {
+		return fmt.Errorf("failed retrieving allocation stream: %w", err)
+	}
+
+	// One map per watch: it carries the pending allocations from one handled event to the next.
+	waitingToRun := make(map[string]bool)
+	return receiveAndHandleNomadAPIEvents(allocStream, func(event nomadApi.Event) error {
+		return handleAllocationEvent(startTime, waitingToRun, event, onNewAllocation, onDeletedAllocation)
+	})
+}
+
+type nomadAPIEventHandler func(event nomadApi.Event) error // called once per received event; a non-nil error stops the receive loop
+
+// receiveAndHandleNomadAPIEvents reads event batches from stream until it is closed and calls handler for every event in them.
+// Heartbeat batches are skipped. A batch error (wrapped) or the first handler error is returned; otherwise nil.
+func receiveAndHandleNomadAPIEvents(stream <-chan *nomadApi.Events, handler nomadAPIEventHandler) error {
+	// The loop ends once the stream is closed, which Nomad does when the original context is cancelled.
 	for events := range stream {
 		if events.IsHeartbeat() {
 			continue
 		}
 		if err := events.Err; err != nil {
-			log.WithError(err).Warn("Error monitoring evaluation")
-			return err
+			return fmt.Errorf("error receiving events: %w", err)
 		}
 		for _, event := range events.Events {
-			eval, err := event.Evaluation()
-			if err != nil {
-				log.WithError(err).Warn("Error retrieving evaluation from streamed event")
+			// TODO: a handler can only stop this loop by returning an error; a nil result always continues.
+			if err := handler(event); err != nil {
 				return err
 			}
-			switch eval.Status {
-			case structs.EvalStatusComplete, structs.EvalStatusCancelled, structs.EvalStatusFailed:
-				return checkEvaluation(eval)
-			default:
+		}
+	}
+	return nil
+}
+
+// handleEvaluationEvent is a nomadAPIEventHandler that returns the checkEvaluation result once an evaluation is final.
+func handleEvaluationEvent(event nomadApi.Event) error {
+	eval, err := event.Evaluation()
+	switch {
+	case err != nil:
+		return fmt.Errorf("failed monitoring evaluation: %w", err)
+	case eval == nil: // payload without an evaluation: skip it, like the alloc == nil guard for allocation events
+		return nil
+	case eval.Status == structs.EvalStatusComplete, eval.Status == structs.EvalStatusCancelled, eval.Status == structs.EvalStatusFailed:
+		return checkEvaluation(eval)
+	}
+	return nil
+}
+
+// handleAllocationEvent is a nomadAPIEventHandler that processes allocation events.
+// onDeletedAllocation is called for a running allocation that should stop. onNewAllocation is called when an
+// allocation recorded in waitingToRun is first seen running. waitingToRun holds the IDs of allocations that are
+// pending but not running yet; as the caller passes the same map on every call, this state survives between events.
+func handleAllocationEvent(startTime int64, waitingToRun map[string]bool, event nomadApi.Event,
+	onNewAllocation, onDeletedAllocation allocationProcessor) error {
+	alloc, err := event.Allocation()
+	if err != nil {
+		return fmt.Errorf("failed retrieving allocation from event %v: %w", event, err)
+	}
+	if alloc == nil || event.Type == structs.TypePlanResult {
+		return nil
+	}
+
+	if event.Type == structs.TypeAllocationUpdated {
+		// When starting the API and listening on the Nomad event stream we might get events that already
+		// happened from Nomad as it seems to buffer them for a certain duration.
+		// Ignore old events here.
+		if alloc.ModifyTime < startTime {
+			return nil
+		}
+
+		if alloc.ClientStatus == structs.AllocClientStatusRunning {
+			switch alloc.DesiredStatus {
+			case structs.AllocDesiredStatusStop:
+				onDeletedAllocation(alloc)
+			case structs.AllocDesiredStatusRun:
+				// first event that marks the transition between pending and running
+				if _, ok := waitingToRun[alloc.ID]; ok {
+					onNewAllocation(alloc)
+					delete(waitingToRun, alloc.ID)
+				}
 			}
 		}
+		if alloc.ClientStatus == structs.AllocClientStatusPending && alloc.DesiredStatus == structs.AllocDesiredStatusRun {
+			// allocation is started, wait until it runs and add to our list afterwards
+			waitingToRun[alloc.ID] = true
+		}
 	}
 	return nil
 }