Refactor Nomad Recovery

from an approach that loaded the runners only once at the startup
to a method that will be repeated i.e. if the Nomad Event Stream connection interrupts.
This commit is contained in:
Maximilian Paß
2023-10-23 14:36:14 +02:00
committed by Sebastian Serth
parent b2898f9183
commit 6b69a2d732
22 changed files with 211 additions and 120 deletions

View File

@ -58,7 +58,7 @@ func (n *AbstractManager) Delete(id dto.EnvironmentID) (bool, error) {
}
n.runnerManager.DeleteEnvironment(id)
if err := e.Delete(false); err != nil {
if err := e.Delete(runner.ErrDestroyedByAPIRequest); err != nil {
return true, fmt.Errorf("could not delete environment: %w", err)
}
return true, nil

View File

@ -45,7 +45,7 @@ func (a *AWSEnvironment) SetImage(awsEndpoint string) {
a.awsEndpoint = awsEndpoint
}
func (a *AWSEnvironment) Delete(_ bool) error {
func (a *AWSEnvironment) Delete(_ runner.DestroyReason) error {
return nil
}

View File

@ -15,9 +15,7 @@ type AWSEnvironmentManager struct {
}
func NewAWSEnvironmentManager(runnerManager runner.Manager) *AWSEnvironmentManager {
m := &AWSEnvironmentManager{&AbstractManager{nil, runnerManager}}
runnerManager.Load()
return m
return &AWSEnvironmentManager{&AbstractManager{nil, runnerManager}}
}
func (a *AWSEnvironmentManager) List(fetch bool) ([]runner.ExecutionEnvironment, error) {

View File

@ -219,13 +219,15 @@ func (n *NomadEnvironment) Register() error {
return nil
}
func (n *NomadEnvironment) Delete(local bool) error {
func (n *NomadEnvironment) Delete(reason runner.DestroyReason) error {
n.cancel()
if !local {
err := n.removeRunners()
if err != nil {
return err
}
err := n.removeRunners(reason)
if err != nil {
return err
}
if !errors.Is(reason, runner.ErrLocalDestruction) {
err = n.apiClient.DeleteJob(*n.job.ID)
if err != nil {
return fmt.Errorf("couldn't delete environment job: %w", err)
@ -238,7 +240,10 @@ func (n *NomadEnvironment) ApplyPrewarmingPoolSize() error {
required := int(n.PrewarmingPoolSize()) - int(n.idleRunners.Length())
if required < 0 {
return fmt.Errorf("%w. Runners to remove: %d", ErrScaleDown, -required)
log.WithError(ErrScaleDown).
WithField(dto.KeyEnvironmentID, n.ID().ToString()).
WithField("offset", -required).Warn("Too many idle runner")
return nil
}
return n.createRunners(uint(required), true)
}
@ -260,6 +265,12 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
}
func (n *NomadEnvironment) AddRunner(r runner.Runner) {
if replacedRunner, ok := n.idleRunners.Get(r.ID()); ok {
err := replacedRunner.Destroy(runner.ErrDestroyedAndReplaced)
if err != nil {
log.WithError(err).Warn("failed removing runner before replacing it")
}
}
n.idleRunners.Add(r.ID(), r)
}
@ -364,10 +375,19 @@ func (n *NomadEnvironment) createRunner(forcePull bool) error {
}
// removeRunners removes all (idle and used) runners for the given environment n.
func (n *NomadEnvironment) removeRunners() error {
func (n *NomadEnvironment) removeRunners(reason runner.DestroyReason) error {
// This prevents a race condition where the number of required runners is miscalculated in the up-scaling process
// based on the number of allocation that has been stopped at the moment of the scaling.
n.idleRunners.Purge()
for _, r := range n.idleRunners.List() {
n.idleRunners.Delete(r.ID())
if err := r.Destroy(runner.ErrLocalDestruction); err != nil {
log.WithError(err).Warn("failed to remove runner locally")
}
}
if errors.Is(reason, runner.ErrLocalDestruction) {
return nil
}
ids, err := n.apiClient.LoadRunnerIDs(nomad.RunnerJobID(n.ID(), ""))
if err != nil {

View File

@ -165,7 +165,7 @@ func (s *MainTestSuite) TestParseJob() {
environment, err := NewNomadEnvironment(tests.DefaultEnvironmentIDAsInteger, apiMock, templateEnvironmentJobHCL)
s.NoError(err)
s.NotNil(environment.job)
s.NoError(environment.Delete(false))
s.NoError(environment.Delete(tests.ErrCleanupDestroyReason))
})
s.Run("returns error when given wrong job", func() {
@ -231,7 +231,7 @@ func (s *MainTestSuite) TestNomadEnvironment_DeleteLocally() {
environment, err := NewNomadEnvironment(tests.DefaultEnvironmentIDAsInteger, apiMock, templateEnvironmentJobHCL)
s.Require().NoError(err)
err = environment.Delete(true)
err = environment.Delete(runner.ErrLocalDestruction)
s.NoError(err)
apiMock.AssertExpectations(s.T())
}

View File

@ -3,6 +3,7 @@ package environment
import (
"context"
_ "embed"
"errors"
"fmt"
nomadApi "github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
@ -13,6 +14,7 @@ import (
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/storage"
"github.com/openHPI/poseidon/pkg/util"
"math"
"os"
"time"
)
@ -36,7 +38,6 @@ func NewNomadEnvironmentManager(
runnerManager runner.Manager,
apiClient nomad.ExecutorAPI,
templateJobFile string,
ctx context.Context,
) (*NomadEnvironmentManager, error) {
if err := loadTemplateEnvironmentJobHCL(templateJobFile); err != nil {
return nil, err
@ -44,10 +45,6 @@ func NewNomadEnvironmentManager(
m := &NomadEnvironmentManager{&AbstractManager{nil, runnerManager},
apiClient, templateEnvironmentJobHCL}
if err := util.RetryExponentialWithContext(ctx, func() error { return m.Load() }); err != nil {
log.WithError(err).Error("Error recovering the execution environments")
}
runnerManager.Load()
return m, nil
}
@ -72,7 +69,8 @@ func (m *NomadEnvironmentManager) Get(id dto.EnvironmentID, fetch bool) (
ok = true
default:
executionEnvironment.SetConfigFrom(fetchedEnvironment)
err = fetchedEnvironment.Delete(true)
// We destroy only this (second) local reference to the environment.
err = fetchedEnvironment.Delete(runner.ErrDestroyedAndReplaced)
if err != nil {
log.WithError(err).Warn("Failed to remove environment locally")
}
@ -113,7 +111,7 @@ func (m *NomadEnvironmentManager) fetchEnvironments() error {
fetchedEnvironment := newNomadEnvironmentFromJob(job, m.api)
localEnvironment.SetConfigFrom(fetchedEnvironment)
// We destroy only this (second) local reference to the environment.
if err = fetchedEnvironment.Delete(true); err != nil {
if err = fetchedEnvironment.Delete(runner.ErrDestroyedAndReplaced); err != nil {
log.WithError(err).Warn("Failed to remove environment locally")
}
} else {
@ -124,7 +122,7 @@ func (m *NomadEnvironmentManager) fetchEnvironments() error {
// Remove local environments that are not remote environments.
for _, localEnvironment := range m.runnerManager.ListEnvironments() {
if _, ok := remoteEnvironments[localEnvironment.ID().ToString()]; !ok {
err := localEnvironment.Delete(true)
err := localEnvironment.Delete(runner.ErrLocalDestruction)
log.WithError(err).Warn("Failed to remove environment locally")
}
}
@ -138,7 +136,7 @@ func (m *NomadEnvironmentManager) CreateOrUpdate(
if isExistingEnvironment {
// Remove existing environment to force downloading the newest Docker image.
// See https://github.com/openHPI/poseidon/issues/69
err = environment.Delete(false)
err = environment.Delete(runner.ErrEnvironmentUpdated)
if err != nil {
return false, fmt.Errorf("failed to remove the environment: %w", err)
}
@ -172,7 +170,39 @@ func (m *NomadEnvironmentManager) CreateOrUpdate(
return !isExistingEnvironment, nil
}
func (m *NomadEnvironmentManager) Load() error {
// KeepEnvironmentsSynced loads all environments, runner existing at Nomad, and watches Nomad events to handle further changes.
func (m *NomadEnvironmentManager) KeepEnvironmentsSynced(synchronizeRunners func(ctx context.Context) error, ctx context.Context) {
err := util.RetryConstantAttemptsWithContext(math.MaxInt, ctx, func() error {
// Load Environments
if err := m.load(); err != nil {
log.WithContext(ctx).WithError(err).Warn("Loading Environments failed! Retrying...")
return err
}
// Load Runners and keep them synchronized.
if err := synchronizeRunners(ctx); err != nil {
log.WithContext(ctx).WithError(err).Warn("Loading and synchronizing Runners failed! Retrying...")
return err
}
return nil
})
if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
log.WithContext(ctx).WithError(err).Fatal("Stopped KeepEnvironmentsSynced")
}
}
// Load recovers all environments from the Jobs in Nomad.
// As it replaces the environments the idle runners stored are not tracked anymore.
func (m *NomadEnvironmentManager) load() error {
// We have to destroy the environments first as otherwise they would just be replaced and old goroutines might stay running.
for _, environment := range m.runnerManager.ListEnvironments() {
err := environment.Delete(runner.ErrDestroyedAndReplaced)
if err != nil {
log.WithError(err).Warn("Failed deleting environment locally. Possible memory leak")
}
}
templateJobs, err := m.api.LoadEnvironmentJobs()
if err != nil {
return fmt.Errorf("couldn't load template jobs: %w", err)

View File

@ -96,9 +96,6 @@ func (s *CreateOrUpdateTestSuite) TestCreateOrUpdatesSetsForcePullFlag() {
}
func (s *MainTestSuite) TestNewNomadEnvironmentManager() {
disableRecovery, cancel := context.WithCancel(context.Background())
cancel()
executorAPIMock := &nomad.ExecutorAPIMock{}
executorAPIMock.On("LoadEnvironmentJobs").Return([]*nomadApi.Job{}, nil)
executorAPIMock.On("LoadRunnerIDs", mock.AnythingOfType("string")).Return([]string{}, nil)
@ -110,7 +107,7 @@ func (s *MainTestSuite) TestNewNomadEnvironmentManager() {
previousTemplateEnvironmentJobHCL := templateEnvironmentJobHCL
s.Run("returns error if template file does not exist", func() {
_, err := NewNomadEnvironmentManager(runnerManagerMock, executorAPIMock, "/non-existent/file", disableRecovery)
_, err := NewNomadEnvironmentManager(runnerManagerMock, executorAPIMock, "/non-existent/file")
s.Error(err)
})
@ -122,12 +119,12 @@ func (s *MainTestSuite) TestNewNomadEnvironmentManager() {
f := createTempFile(s.T(), templateJobHCL)
defer os.Remove(f.Name())
m, err := NewNomadEnvironmentManager(runnerManagerMock, executorAPIMock, f.Name(), disableRecovery)
m, err := NewNomadEnvironmentManager(runnerManagerMock, executorAPIMock, f.Name())
s.NoError(err)
s.NotNil(m)
s.Equal(templateJobHCL, m.templateEnvironmentHCL)
s.NoError(environment.Delete(false))
s.NoError(environment.Delete(tests.ErrCleanupDestroyReason))
})
s.Run("returns error if template file is invalid", func() {
@ -135,7 +132,7 @@ func (s *MainTestSuite) TestNewNomadEnvironmentManager() {
f := createTempFile(s.T(), templateJobHCL)
defer os.Remove(f.Name())
m, err := NewNomadEnvironmentManager(runnerManagerMock, executorAPIMock, f.Name(), disableRecovery)
m, err := NewNomadEnvironmentManager(runnerManagerMock, executorAPIMock, f.Name())
s.Require().NoError(err)
_, err = NewNomadEnvironment(tests.DefaultEnvironmentIDAsInteger, nil, m.templateEnvironmentHCL)
s.Error(err)
@ -145,9 +142,6 @@ func (s *MainTestSuite) TestNewNomadEnvironmentManager() {
}
func (s *MainTestSuite) TestNomadEnvironmentManager_Get() {
disableRecovery, cancel := context.WithCancel(context.Background())
cancel()
apiMock := &nomad.ExecutorAPIMock{}
mockWatchAllocations(s.TestCtx, apiMock)
apiMock.On("LoadRunnerIDs", mock.AnythingOfType("string")).Return([]string{}, nil)
@ -158,7 +152,7 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_Get() {
})
runnerManager := runner.NewNomadRunnerManager(apiMock, s.TestCtx)
m, err := NewNomadEnvironmentManager(runnerManager, apiMock, "", disableRecovery)
m, err := NewNomadEnvironmentManager(runnerManager, apiMock, "")
s.Require().NoError(err)
s.Run("Returns error when not found", func() {
@ -177,7 +171,7 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_Get() {
s.NoError(err)
s.Equal(expectedEnvironment, environment)
err = environment.Delete(false)
err = environment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
})
@ -211,11 +205,11 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_Get() {
s.NoError(err)
s.Equal(fetchedEnvironment.Image(), environment.Image())
err = fetchedEnvironment.Delete(false)
err = fetchedEnvironment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
err = environment.Delete(false)
err = environment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
err = localEnvironment.Delete(false)
err = localEnvironment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
})
runnerManager.DeleteEnvironment(tests.DefaultEnvironmentIDAsInteger)
@ -237,18 +231,15 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_Get() {
s.NoError(err)
s.Equal(fetchedEnvironment.Image(), environment.Image())
err = fetchedEnvironment.Delete(false)
err = fetchedEnvironment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
err = environment.Delete(false)
err = environment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
})
})
}
func (s *MainTestSuite) TestNomadEnvironmentManager_List() {
disableRecovery, cancel := context.WithCancel(context.Background())
cancel()
apiMock := &nomad.ExecutorAPIMock{}
apiMock.On("LoadRunnerIDs", mock.AnythingOfType("string")).Return([]string{}, nil)
apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
@ -259,7 +250,7 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_List() {
})
runnerManager := runner.NewNomadRunnerManager(apiMock, s.TestCtx)
m, err := NewNomadEnvironmentManager(runnerManager, apiMock, "", disableRecovery)
m, err := NewNomadEnvironmentManager(runnerManager, apiMock, "")
s.Require().NoError(err)
s.Run("with no environments", func() {
@ -279,7 +270,7 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_List() {
s.Equal(1, len(environments))
s.Equal(localEnvironment, environments[0])
err = localEnvironment.Delete(false)
err = localEnvironment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
})
runnerManager.DeleteEnvironment(tests.DefaultEnvironmentIDAsInteger)
@ -306,9 +297,9 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_List() {
s.True(ok)
s.Equal(fetchedEnvironment.job, nomadEnvironment.job)
err = fetchedEnvironment.Delete(false)
err = fetchedEnvironment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
err = nomadEnvironment.Delete(false)
err = nomadEnvironment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
})
}
@ -331,14 +322,17 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_Load() {
_, ok := runnerManager.GetEnvironment(tests.DefaultEnvironmentIDAsInteger)
s.Require().False(ok)
_, err := NewNomadEnvironmentManager(runnerManager, apiMock, "", s.TestCtx)
m, err := NewNomadEnvironmentManager(runnerManager, apiMock, "")
s.Require().NoError(err)
err = m.load()
s.Require().NoError(err)
environment, ok := runnerManager.GetEnvironment(tests.DefaultEnvironmentIDAsInteger)
s.Require().True(ok)
s.Equal("python:latest", environment.Image())
err = environment.Delete(false)
err = environment.Delete(tests.ErrCleanupDestroyReason)
s.Require().NoError(err)
})
@ -352,7 +346,7 @@ func (s *MainTestSuite) TestNomadEnvironmentManager_Load() {
_, ok := runnerManager.GetEnvironment(tests.DefaultEnvironmentIDAsInteger)
s.Require().False(ok)
_, err := NewNomadEnvironmentManager(runnerManager, apiMock, "", context.Background())
_, err := NewNomadEnvironmentManager(runnerManager, apiMock, "")
s.Require().NoError(err)
_, ok = runnerManager.GetEnvironment(tests.DefaultEnvironmentIDAsInteger)