Recreate runners on execution environment update

Solves #69 and #48
This commit is contained in:
Maximilian Paß
2021-12-16 18:21:21 +01:00
parent 3abfdb6cc6
commit 0571b10b5c
5 changed files with 50 additions and 43 deletions

View File

@@ -3,7 +3,6 @@ package environment
import (
"context"
"encoding/json"
"errors"
"fmt"
"github.com/google/uuid"
nomadApi "github.com/hashicorp/nomad/api"
@@ -12,16 +11,13 @@ import (
"github.com/openHPI/poseidon/internal/runner"
"github.com/openHPI/poseidon/pkg/dto"
"strconv"
"sync"
)
const (
portNumberBase = 10
)
var (
ErrorUpdatingExecutionEnvironment = errors.New("errors occurred when updating environment")
)
type NomadEnvironment struct {
jobHCL string
job *nomadApi.Job
@@ -173,6 +169,11 @@ func (n *NomadEnvironment) SetNetworkAccess(allow bool, exposedPorts []uint16) {
// Register creates a Nomad job based on the default job configuration and the given parameters.
// It registers the job with Nomad and waits until the registration completes.
func (n *NomadEnvironment) Register(apiClient nomad.ExecutorAPI) error {
// To avoid docker image issues. See https://github.com/openHPI/poseidon/issues/69
if err := n.Delete(apiClient); err != nil {
return fmt.Errorf("failed to remove the environment: %w", err)
}
nomad.SetForcePullFlag(n.job, true) // This must be the default as otherwise new runners could have different images.
evalID, err := apiClient.RegisterNomadJob(n.job)
if err != nil {
@@ -188,7 +189,7 @@ func (n *NomadEnvironment) Register(apiClient nomad.ExecutorAPI) error {
}
func (n *NomadEnvironment) Delete(apiClient nomad.ExecutorAPI) error {
err := n.removeRunners(apiClient, uint(n.idleRunners.Length()))
err := n.removeRunners(apiClient)
if err != nil {
return err
}
@@ -205,36 +206,10 @@ func (n *NomadEnvironment) Scale(apiClient nomad.ExecutorAPI) error {
if required > 0 {
return n.createRunners(apiClient, uint(required), true)
} else {
return n.removeRunners(apiClient, uint(-required))
return n.removeIdleRunners(apiClient, uint(-required))
}
}
func (n *NomadEnvironment) UpdateRunnerSpecs(apiClient nomad.ExecutorAPI) error {
runners, err := apiClient.LoadRunnerIDs(n.ID().ToString())
if err != nil {
return fmt.Errorf("update environment couldn't load runners: %w", err)
}
var occurredError error
for _, id := range runners {
// avoid taking the address of the loop variable
runnerID := id
updatedRunnerJob := n.DeepCopyJob()
updatedRunnerJob.ID = &runnerID
updatedRunnerJob.Name = &runnerID
nomad.SetForcePullFlag(updatedRunnerJob, true)
err := apiClient.RegisterRunnerJob(updatedRunnerJob)
if err != nil {
if occurredError == nil {
occurredError = ErrorUpdatingExecutionEnvironment
}
occurredError = fmt.Errorf("%w; new api error for runner %s - %v", occurredError, id, err)
}
}
return occurredError
}
func (n *NomadEnvironment) Sample(apiClient nomad.ExecutorAPI) (runner.Runner, bool) {
r, ok := n.idleRunners.Sample()
if ok {
@@ -347,7 +322,7 @@ func (n *NomadEnvironment) createRunner(apiClient nomad.ExecutorAPI, forcePull b
return nil
}
func (n *NomadEnvironment) removeRunners(apiClient nomad.ExecutorAPI, count uint) error {
func (n *NomadEnvironment) removeIdleRunners(apiClient nomad.ExecutorAPI, count uint) error {
log.WithField("runnersToDelete", count).WithField("id", n.ID()).Debug("Removing idle runners")
for i := 0; i < int(count); i++ {
r, ok := n.idleRunners.Sample()
@@ -361,3 +336,29 @@ func (n *NomadEnvironment) removeRunners(apiClient nomad.ExecutorAPI, count uint
}
return nil
}
func (n *NomadEnvironment) removeRunners(apiClient nomad.ExecutorAPI) error {
// Only to avoid timing issues as an idle runner is also removed when Nomad has deleted the allocation.
for _, r := range n.idleRunners.List() {
n.idleRunners.Delete(r.ID())
}
ids, err := apiClient.LoadRunnerIDs(nomad.RunnerJobID(n.ID(), ""))
if err != nil {
return fmt.Errorf("failed to load runner ids: %w", err)
}
var wg sync.WaitGroup
for _, id := range ids {
wg.Add(1)
go func(jobID string) {
defer wg.Done()
deleteErr := apiClient.DeleteJob(jobID)
if deleteErr != nil {
err = deleteErr
}
}(id)
}
wg.Wait()
return err
}