Add Nomad job registration with subsequent monitoring
Once a Nomad job is registered, we listen to the Nomad event stream and return once the corresponding evaluation has completed (a usage sketch follows the commit metadata below).

committed by Tobias Kantusch
parent 4c3cc0cc4c
commit f228a3e599
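
For illustration, the sketch below shows how a caller might combine job registration with the new MonitorEvaluation call. The jobRegistrar interface and its RegisterNomadJob method are assumptions made for this example; only MonitorEvaluation is introduced by this commit.

package nomad

import (
    "context"
    "time"
)

// jobRegistrar is a minimal, hypothetical view of the API used by this sketch:
// it pairs the MonitorEvaluation method added in this commit with a job
// registration call that yields an evaluation ID. The registration method is
// an assumption and not part of this diff.
type jobRegistrar interface {
    RegisterNomadJob(jobHCL string) (evalID string, err error)
    MonitorEvaluation(evalID string, ctx context.Context) error
}

// registerAndWait registers a job and blocks until Nomad has finished
// evaluating it (or the timeout expires).
func registerAndWait(api jobRegistrar, jobHCL string) error {
    evalID, err := api.RegisterNomadJob(jobHCL)
    if err != nil {
        return err
    }

    // Bound the wait so a stuck evaluation cannot block the caller forever.
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    // Returns nil once the evaluation completed successfully, or an error
    // describing placement failures otherwise.
    return api.MonitorEvaluation(evalID, ctx)
}

Passing a context with a deadline bounds how long the caller waits for Nomad to evaluate the job; cancelling the context makes MonitorEvaluation return without error.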
@@ -1,16 +1,30 @@
 package nomad
 
 import (
+    "context"
+    "errors"
+    "fmt"
+    nomadApi "github.com/hashicorp/nomad/api"
+    "github.com/hashicorp/nomad/nomad/structs"
     "gitlab.hpi.de/codeocean/codemoon/poseidon/logging"
     "net/url"
+    "strings"
 )
 
 var log = logging.GetLogger("nomad")
 
 // ExecutorApi provides access to an container orchestration solution
 type ExecutorApi interface {
     apiQuerier
 
     // LoadRunners loads all allocations of the specified job which are running and not about to get stopped.
     LoadRunners(jobId string) (runnerIds []string, err error)
+
+    // MonitorEvaluation monitors the given evaluation ID.
+    // It waits until the evaluation reaches one of the states complete, cancelled or failed.
+    // If the evaluation was not successful, an error containing the failures is returned.
+    // See also https://github.com/hashicorp/nomad/blob/7d5a9ecde95c18da94c9b6ace2565afbfdd6a40d/command/monitor.go#L175
+    MonitorEvaluation(evalID string, ctx context.Context) error
 }
 
 // ApiClient implements the ExecutorApi interface and can be used to perform different operations on the real Executor API and its return values.
@@ -27,8 +41,8 @@ func NewExecutorApi(nomadURL *url.URL, nomadNamespace string) (ExecutorApi, erro
 }
 
 // init prepares an apiClient to be able to communicate to a provided Nomad API.
-func (apiClient *ApiClient) init(nomadURL *url.URL, nomadNamespace string) (err error) {
-    err = apiClient.apiQuerier.init(nomadURL, nomadNamespace)
+func (a *ApiClient) init(nomadURL *url.URL, nomadNamespace string) (err error) {
+    err = a.apiQuerier.init(nomadURL, nomadNamespace)
     if err != nil {
         return err
     }
@@ -36,9 +50,9 @@ func (apiClient *ApiClient) init(nomadURL *url.URL, nomadNamespace string) (err
 }
 
 // LoadRunners loads the allocations of the specified job.
-func (apiClient *ApiClient) LoadRunners(jobId string) (runnerIds []string, err error) {
+func (a *ApiClient) LoadRunners(jobId string) (runnerIds []string, err error) {
     //list, _, err := apiClient.client.Jobs().Allocations(jobId, true, nil)
-    list, err := apiClient.loadRunners(jobId)
+    list, err := a.loadRunners(jobId)
     if err != nil {
         return nil, err
     }
@@ -50,3 +64,59 @@ func (apiClient *ApiClient) LoadRunners(jobId string) (runnerIds []string, err e
     }
     return
 }
+
+func (a *ApiClient) MonitorEvaluation(evalID string, ctx context.Context) error {
+    var events *nomadApi.Events
+    stream, err := a.EvaluationStream(evalID, ctx)
+    if err != nil {
+        return err
+    }
+    for {
+        select {
+        case events = <-stream:
+        case <-ctx.Done():
+            return nil
+        }
+        if events.IsHeartbeat() {
+            continue
+        }
+        if err := events.Err; err != nil {
+            log.WithError(err).Warn("Error monitoring evaluation")
+            return err
+        }
+        for _, event := range events.Events {
+            eval, err := event.Evaluation()
+            if err != nil {
+                log.WithError(err).Warn("Error retrieving evaluation from streamed event")
+                return err
+            }
+            switch eval.Status {
+            case structs.EvalStatusComplete, structs.EvalStatusCancelled, structs.EvalStatusFailed:
+                return checkEvaluation(eval)
+            default:
+            }
+        }
+    }
+}
+
+// checkEvaluation checks whether the given evaluation failed.
+// If the evaluation failed, it returns an error with a message containing the failure information.
+func checkEvaluation(eval *nomadApi.Evaluation) error {
+    if len(eval.FailedTGAllocs) == 0 {
+        if eval.Status == structs.EvalStatusComplete {
+            return nil
+        }
+        return fmt.Errorf("evaluation could not complete: %q", eval.Status)
+    } else {
+        messages := []string{
+            fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations.", eval.ID, eval.Status),
+        }
+        for tg, metrics := range eval.FailedTGAllocs {
+            messages = append(messages, fmt.Sprintf("%s: %#v", tg, metrics))
+        }
+        if eval.BlockedEval != "" {
+            messages = append(messages, fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder", eval.BlockedEval))
+        }
+        return errors.New(strings.Join(messages, "\n"))
+    }
+}
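
MonitorEvaluation consumes an events channel obtained from EvaluationStream, which belongs to the apiQuerier and is not part of this diff. The sketch below shows one way such a stream could be opened with the Nomad Go API, filtered to the Evaluation topic for the given evaluation ID; the helper name and the starting index are assumptions, not code from this commit.

package nomad

import (
    "context"
    "fmt"

    nomadApi "github.com/hashicorp/nomad/api"
)

// evaluationStream sketches how the events channel consumed by
// MonitorEvaluation could be opened: subscribe to Nomad's event stream,
// filtered to the Evaluation topic and the given evaluation ID.
func evaluationStream(client *nomadApi.Client, evalID string, ctx context.Context) (<-chan *nomadApi.Events, error) {
    stream, err := client.EventStream().Stream(
        ctx,
        map[nomadApi.Topic][]string{
            nomadApi.TopicEvaluation: {evalID},
        },
        0,   // starting index for the subscription
        nil, // default query options
    )
    if err != nil {
        return nil, fmt.Errorf("opening evaluation stream: %w", err)
    }
    return stream, nil
}

Filtering the subscription by topic and evaluation ID keeps the channel limited to events relevant to the registered job, so the loop in MonitorEvaluation mostly sees heartbeats and the evaluation's own state changes.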