Add Nomad job registration with subsequent monitoring
Once a Nomad job is registered, we listen to the Nomad event stream and return once the corresponding evaluation has completed (a usage sketch follows the commit metadata below).

committed by Tobias Kantusch
parent 4c3cc0cc4c
commit f228a3e599
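
For illustration, the sketch below shows how a caller might combine job registration with the new MonitorEvaluation call. The jobRegistrar interface and its RegisterNomadJob method are assumptions made for this example; only MonitorEvaluation is introduced by this commit.

package nomad

import (
    "context"
    "time"
)

// jobRegistrar is a minimal, hypothetical view of the API used by this sketch:
// it pairs the MonitorEvaluation method added in this commit with a job
// registration call that yields an evaluation ID. The registration method is
// an assumption and not part of this diff.
type jobRegistrar interface {
    RegisterNomadJob(jobHCL string) (evalID string, err error)
    MonitorEvaluation(evalID string, ctx context.Context) error
}

// registerAndWait registers a job and blocks until Nomad has finished
// evaluating it (or the timeout expires).
func registerAndWait(api jobRegistrar, jobHCL string) error {
    evalID, err := api.RegisterNomadJob(jobHCL)
    if err != nil {
        return err
    }

    // Bound the wait so a stuck evaluation cannot block the caller forever.
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    // Returns nil once the evaluation completed successfully, or an error
    // describing placement failures otherwise.
    return api.MonitorEvaluation(evalID, ctx)
}

Passing a context with a deadline bounds how long the caller waits for Nomad to evaluate the job; cancelling the context makes MonitorEvaluation return without error.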
@@ -1,16 +1,30 @@
 package nomad
 
 import (
+    "context"
+    "errors"
+    "fmt"
+    nomadApi "github.com/hashicorp/nomad/api"
+    "github.com/hashicorp/nomad/nomad/structs"
     "gitlab.hpi.de/codeocean/codemoon/poseidon/logging"
     "net/url"
+    "strings"
 )
 
 var log = logging.GetLogger("nomad")
 
 // ExecutorApi provides access to an container orchestration solution
 type ExecutorApi interface {
     apiQuerier
 
     // LoadRunners loads all allocations of the specified job which are running and not about to get stopped.
     LoadRunners(jobId string) (runnerIds []string, err error)
+
+    // MonitorEvaluation monitors the given evaluation ID.
+    // It waits until the evaluation reaches one of the states complete, cancelled or failed.
+    // If the evaluation was not successful, an error containing the failures is returned.
+    // See also https://github.com/hashicorp/nomad/blob/7d5a9ecde95c18da94c9b6ace2565afbfdd6a40d/command/monitor.go#L175
+    MonitorEvaluation(evalID string, ctx context.Context) error
 }
 
 // ApiClient implements the ExecutorApi interface and can be used to perform different operations on the real Executor API and its return values.
@@ -27,8 +41,8 @@ func NewExecutorApi(nomadURL *url.URL, nomadNamespace string) (ExecutorApi, erro
 }
 
 // init prepares an apiClient to be able to communicate to a provided Nomad API.
-func (apiClient *ApiClient) init(nomadURL *url.URL, nomadNamespace string) (err error) {
-    err = apiClient.apiQuerier.init(nomadURL, nomadNamespace)
+func (a *ApiClient) init(nomadURL *url.URL, nomadNamespace string) (err error) {
+    err = a.apiQuerier.init(nomadURL, nomadNamespace)
     if err != nil {
         return err
     }
@@ -36,9 +50,9 @@ func (apiClient *ApiClient) init(nomadURL *url.URL, nomadNamespace string) (err
 }
 
 // LoadRunners loads the allocations of the specified job.
-func (apiClient *ApiClient) LoadRunners(jobId string) (runnerIds []string, err error) {
+func (a *ApiClient) LoadRunners(jobId string) (runnerIds []string, err error) {
     //list, _, err := apiClient.client.Jobs().Allocations(jobId, true, nil)
-    list, err := apiClient.loadRunners(jobId)
+    list, err := a.loadRunners(jobId)
     if err != nil {
         return nil, err
     }
@@ -50,3 +64,59 @@ func (apiClient *ApiClient) LoadRunners(jobId string) (runnerIds []string, err e
     }
     return
 }
+
+func (a *ApiClient) MonitorEvaluation(evalID string, ctx context.Context) error {
+    var events *nomadApi.Events
+    stream, err := a.EvaluationStream(evalID, ctx)
+    if err != nil {
+        return err
+    }
+    for {
+        select {
+        case events = <-stream:
+        case <-ctx.Done():
+            return nil
+        }
+        if events.IsHeartbeat() {
+            continue
+        }
+        if err := events.Err; err != nil {
+            log.WithError(err).Warn("Error monitoring evaluation")
+            return err
+        }
+        for _, event := range events.Events {
+            eval, err := event.Evaluation()
+            if err != nil {
+                log.WithError(err).Warn("Error retrieving evaluation from streamed event")
+                return err
+            }
+            switch eval.Status {
+            case structs.EvalStatusComplete, structs.EvalStatusCancelled, structs.EvalStatusFailed:
+                return checkEvaluation(eval)
+            default:
+            }
+        }
+    }
+}
+
+// checkEvaluation checks whether the given evaluation failed.
+// If the evaluation failed, it returns an error with a message containing the failure information.
+func checkEvaluation(eval *nomadApi.Evaluation) error {
+    if len(eval.FailedTGAllocs) == 0 {
+        if eval.Status == structs.EvalStatusComplete {
+            return nil
+        }
+        return fmt.Errorf("evaluation could not complete: %q", eval.Status)
+    } else {
+        messages := []string{
+            fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations.", eval.ID, eval.Status),
+        }
+        for tg, metrics := range eval.FailedTGAllocs {
+            messages = append(messages, fmt.Sprintf("%s: %#v", tg, metrics))
+        }
+        if eval.BlockedEval != "" {
+            messages = append(messages, fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder", eval.BlockedEval))
+        }
+        return errors.New(strings.Join(messages, "\n"))
+    }
+}
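
MonitorEvaluation consumes an events channel obtained from EvaluationStream, which belongs to the apiQuerier and is not part of this diff. The sketch below shows one way such a stream could be opened with the Nomad Go API, filtered to the Evaluation topic for the given evaluation ID; the helper name and the starting index are assumptions, not code from this commit.

package nomad

import (
    "context"
    "fmt"

    nomadApi "github.com/hashicorp/nomad/api"
)

// evaluationStream sketches how the events channel consumed by
// MonitorEvaluation could be opened: subscribe to Nomad's event stream,
// filtered to the Evaluation topic and the given evaluation ID.
func evaluationStream(client *nomadApi.Client, evalID string, ctx context.Context) (<-chan *nomadApi.Events, error) {
    stream, err := client.EventStream().Stream(
        ctx,
        map[nomadApi.Topic][]string{
            nomadApi.TopicEvaluation: {evalID},
        },
        0,   // starting index for the subscription
        nil, // default query options
    )
    if err != nil {
        return nil, fmt.Errorf("opening evaluation stream: %w", err)
    }
    return stream, nil
}

Filtering the subscription by topic and evaluation ID keeps the channel limited to events relevant to the registered job, so the loop in MonitorEvaluation mostly sees heartbeats and the evaluation's own state changes.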