Refactor Nomad Recovery
from an approach that loaded the runners only once at startup to a method that is repeated, e.g. whenever the Nomad Event Stream connection is interrupted.

Committed by: Sebastian Serth
Parent: b2898f9183
Commit: 6b69a2d732
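
The change removes the internal keepRunnersSynced retry loop and instead exposes SynchronizeRunners, which loads all runners once and then blocks on the Nomad Event Stream. Below is a minimal sketch of how a caller could repeat that recovery whenever the stream connection drops; the loop, the one-second backoff, the helper name synchronizeNomad, the package name, and the import path are illustrative assumptions and not part of this commit.

package recovery

import (
    "context"
    "log"
    "time"

    "github.com/openHPI/poseidon/internal/runner" // assumed import path of the runner package shown in the diff
)

// synchronizeNomad is a hypothetical caller that re-runs SynchronizeRunners so the
// runners are reloaded every time the Nomad Event Stream connection breaks.
func synchronizeNomad(ctx context.Context, runnerManager *runner.NomadRunnerManager) {
    for ctx.Err() == nil {
        // SynchronizeRunners loads all runner jobs and then blocks on the Nomad
        // Event Stream; it returns once that connection is interrupted.
        if err := runnerManager.SynchronizeRunners(ctx); err != nil {
            log.Printf("Nomad Event Stream disconnected, reloading runners: %v", err)
        }
        // Back off briefly before reconnecting so a flapping stream does not busy-loop.
        select {
        case <-ctx.Done():
            return
        case <-time.After(time.Second):
        }
    }
}
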
@@ -123,5 +123,3 @@ func (n *AbstractManager) Get(runnerID string) (Runner, error) {
func (n *AbstractManager) Return(_ Runner) error {
    return nil
}

func (n *AbstractManager) Load() {}

@@ -111,5 +111,6 @@ func createBasicEnvironmentMock(id dto.EnvironmentID) *ExecutionEnvironmentMock
    environment.On("MemoryLimit").Return(uint(0))
    environment.On("NetworkAccess").Return(false, nil)
    environment.On("DeleteRunner", mock.AnythingOfType("string")).Return(false)
    environment.On("ApplyPrewarmingPoolSize").Return(nil)
    return environment
}

@@ -39,7 +39,7 @@ type ExecutionEnvironment interface {
    Register() error
    // Delete removes this environment and all it's runner from the executor and Poseidon itself.
    // Iff local the environment is just removed from Poseidon without external escalation.
    Delete(local bool) error
    Delete(reason DestroyReason) error

    // Sample returns and removes an arbitrary available runner.
    // ok is true iff a runner was returned.

@@ -1,4 +1,4 @@
// Code generated by mockery v2.33.2. DO NOT EDIT.
// Code generated by mockery v2.36.0. DO NOT EDIT.

package runner

@@ -45,13 +45,13 @@ func (_m *ExecutionEnvironmentMock) CPULimit() uint {
    return r0
}

// Delete provides a mock function with given fields: local
func (_m *ExecutionEnvironmentMock) Delete(local bool) error {
    ret := _m.Called(local)
// Delete provides a mock function with given fields: reason
func (_m *ExecutionEnvironmentMock) Delete(reason DestroyReason) error {
    ret := _m.Called(reason)

    var r0 error
    if rf, ok := ret.Get(0).(func(bool) error); ok {
        r0 = rf(local)
    if rf, ok := ret.Get(0).(func(DestroyReason) error); ok {
        r0 = rf(reason)
    } else {
        r0 = ret.Error(0)
    }

@@ -51,8 +51,4 @@ type Accessor interface {
    // Return signals that the runner is no longer used by the caller and can be claimed by someone else.
    // The runner is deleted or cleaned up for reuse depending on the used executor.
    Return(r Runner) error

    // Load fetches all already created runners from the executor and registers them.
    // It should be called during the startup process (e.g. on creation of the Manager).
    Load()
}

@@ -10,9 +10,9 @@ import (
    "github.com/openHPI/poseidon/pkg/dto"
    "github.com/openHPI/poseidon/pkg/logging"
    "github.com/openHPI/poseidon/pkg/monitoring"
    "github.com/openHPI/poseidon/pkg/storage"
    "github.com/openHPI/poseidon/pkg/util"
    "github.com/sirupsen/logrus"
    "math"
    "strconv"
    "time"
)

@@ -30,12 +30,9 @@ type NomadRunnerManager struct {
}

// NewNomadRunnerManager creates a new runner manager that keeps track of all runners.
// It uses the apiClient for all requests and runs a background task to keep the runners in sync with Nomad.
// If you cancel the context the background synchronization will be stopped.
// KeepRunnersSynced has to be started separately.
func NewNomadRunnerManager(apiClient nomad.ExecutorAPI, ctx context.Context) *NomadRunnerManager {
    m := &NomadRunnerManager{NewAbstractManager(ctx), apiClient}
    go m.keepRunnersSynced(ctx)
    return m
    return &NomadRunnerManager{NewAbstractManager(ctx), apiClient}
}

func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int) (Runner, error) {

@@ -80,40 +77,64 @@ func (m *NomadRunnerManager) Return(r Runner) error {
    return err
}

func (m *NomadRunnerManager) Load() {
// SynchronizeRunners loads all runners and keeps them synchronized (without a retry mechanism).
func (m *NomadRunnerManager) SynchronizeRunners(ctx context.Context) error {
    // Load Runners
    if err := m.load(); err != nil {
        return fmt.Errorf("failed loading runners: %w", err)
    }

    // Watch for changes regarding the existing or new runners.
    err := m.apiClient.WatchEventStream(ctx,
        &nomad.AllocationProcessing{OnNew: m.onAllocationAdded, OnDeleted: m.onAllocationStopped})

    if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
        err = fmt.Errorf("nomad Event Stream failed!: %w", err)
    }
    return err
}

// Load recovers all runners for all existing environments.
func (m *NomadRunnerManager) load() error {
    newUsedRunners := storage.NewLocalStorage[Runner]()

    for _, environment := range m.environments.List() {
        environmentLogger := log.WithField(dto.KeyEnvironmentID, environment.ID().ToString())

        runnerJobs, err := m.apiClient.LoadRunnerJobs(environment.ID())
        if err != nil {
            environmentLogger.WithError(err).Warn("Error fetching the runner jobs")
            return fmt.Errorf("failed fetching the runner jobs: %w", err)
        }
        for _, job := range runnerJobs {
            m.loadSingleJob(job, environmentLogger, environment)
            m.loadSingleJob(job, environmentLogger, environment, newUsedRunners)
        }
        err = environment.ApplyPrewarmingPoolSize()
        if err != nil {
            environmentLogger.WithError(err).Error("Couldn't scale environment")
            return fmt.Errorf("couldn't scale environment: %w", err)
        }
    }

    m.updateUsedRunners(newUsedRunners)
    return nil
}

func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger *logrus.Entry,
    environment ExecutionEnvironment) {
    environment ExecutionEnvironment, newUsedRunners storage.Storage[Runner]) {
    configTaskGroup := nomad.FindTaskGroup(job, nomad.ConfigTaskGroupName)
    if configTaskGroup == nil {
        environmentLogger.Infof("Couldn't find config task group in job %s, skipping ...", *job.ID)
        environmentLogger.Warnf("Couldn't find config task group in job %s, skipping ...", *job.ID)
        return
    }
    isUsed := configTaskGroup.Meta[nomad.ConfigMetaUsedKey] == nomad.ConfigMetaUsedValue
    portMappings, err := m.apiClient.LoadRunnerPortMappings(*job.ID)
    if err != nil {
        environmentLogger.WithError(err).Warn("Error loading runner portMappings")
        environmentLogger.WithError(err).Warn("Error loading runner portMappings, skipping ...")
        return
    }
    newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.onRunnerDestroyed)
    log.WithField("isUsed", isUsed).WithField(dto.KeyRunnerID, newJob.ID()).Debug("Recovered Runner")
    if isUsed {
        m.usedRunners.Add(newJob.ID(), newJob)
        newUsedRunners.Add(newJob.ID(), newJob)
        timeout, err := strconv.Atoi(configTaskGroup.Meta[nomad.ConfigMetaTimeoutKey])
        if err != nil {
            environmentLogger.WithError(err).Warn("Error loading timeout from meta values")

@@ -125,18 +146,23 @@ func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger
        }
    }

func (m *NomadRunnerManager) keepRunnersSynced(ctx context.Context) {
    err := util.RetryConstantAttemptsWithContext(math.MaxInt, ctx, func() error {
        err := m.apiClient.WatchEventStream(ctx,
            &nomad.AllocationProcessing{OnNew: m.onAllocationAdded, OnDeleted: m.onAllocationStopped})
        if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
            log.WithContext(ctx).WithError(err).Errorf("Nomad Event Stream failed! Retrying...")
            err = fmt.Errorf("KeepRunnersSynced: %w", err)
func (m *NomadRunnerManager) updateUsedRunners(newUsedRunners storage.Storage[Runner]) {
    for _, r := range m.usedRunners.List() {
        var reason DestroyReason
        if _, ok := newUsedRunners.Get(r.ID()); ok {
            reason = ErrDestroyedAndReplaced
        } else {
            reason = ErrLocalDestruction
            log.WithError(reason).WithField(dto.KeyRunnerID, r.ID()).Warn("Local runner cannot be recovered")
        }
        return err
    })
    if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
        log.WithContext(ctx).WithError(err).Fatal("Stopped Restarting the Nomad Event Stream")
        m.usedRunners.Delete(r.ID())
        if err := r.Destroy(reason); err != nil {
            log.WithError(err).WithField(dto.KeyRunnerID, r.ID()).Warn("failed to destroy runner locally")
        }
    }

    for _, r := range newUsedRunners.List() {
        m.usedRunners.Add(r.ID(), r)
    }
}

@@ -59,6 +59,7 @@ func mockRunnerQueries(ctx context.Context, apiMock *nomad.ExecutorAPIMock, retu
        call.ReturnArguments = mock.Arguments{nil}
    })
    apiMock.On("LoadEnvironmentJobs").Return([]*nomadApi.Job{}, nil)
    apiMock.On("LoadRunnerJobs", mock.AnythingOfType("dto.EnvironmentID")).Return([]*nomadApi.Job{}, nil)
    apiMock.On("MarkRunnerAsUsed", mock.AnythingOfType("string"), mock.AnythingOfType("int")).Return(nil)
    apiMock.On("LoadRunnerIDs", tests.DefaultRunnerID).Return(returnedRunnerIds, nil)
    apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)

@@ -258,12 +259,19 @@ func (s *ManagerTestSuite) TestUpdateRunnersLogsErrorFromWatchAllocation() {
        })
    })

    go s.nomadRunnerManager.keepRunnersSynced(s.TestCtx)
    go func() {
        err := s.nomadRunnerManager.SynchronizeRunners(s.TestCtx)
        if err != nil {
            log.WithError(err).Error("failed to synchronize runners")
        }
    }()
    <-time.After(10 * time.Millisecond)

    s.Require().Equal(1, len(hook.Entries))
    s.Equal(logrus.ErrorLevel, hook.LastEntry().Level)
    s.Equal(hook.LastEntry().Data[logrus.ErrorKey], tests.ErrDefault)
    err, ok := hook.LastEntry().Data[logrus.ErrorKey].(error)
    s.Require().True(ok)
    s.ErrorIs(err, tests.ErrDefault)
}

func (s *ManagerTestSuite) TestUpdateRunnersAddsIdleRunner() {

@@ -285,7 +293,12 @@ func (s *ManagerTestSuite) TestUpdateRunnersAddsIdleRunner() {
        })
    })

    go s.nomadRunnerManager.keepRunnersSynced(s.TestCtx)
    go func() {
        err := s.nomadRunnerManager.SynchronizeRunners(s.TestCtx)
        if err != nil {
            log.WithError(err).Error("failed to synchronize runners")
        }
    }()
    <-time.After(10 * time.Millisecond)

    r, ok := environment.Sample()

@@ -313,7 +326,12 @@ func (s *ManagerTestSuite) TestUpdateRunnersRemovesIdleAndUsedRunner() {
        })
    })

    go s.nomadRunnerManager.keepRunnersSynced(s.TestCtx)
    go func() {
        err := s.nomadRunnerManager.SynchronizeRunners(s.TestCtx)
        if err != nil {
            log.WithError(err).Error("failed to synchronize runners")
        }
    }()
    <-time.After(tests.ShortTimeout)

    _, ok = environment.Sample()

@@ -515,7 +533,8 @@ func (s *MainTestSuite) TestNomadRunnerManager_Load() {
        s.ExpectedGoroutingIncrease++ // We dont care about destroying the created runner.
        call.Return([]*nomadApi.Job{job}, nil)

        runnerManager.Load()
        err := runnerManager.load()
        s.NoError(err)

        environmentMock.AssertExpectations(s.T())
    })

@@ -533,7 +552,8 @@ func (s *MainTestSuite) TestNomadRunnerManager_Load() {

        s.Require().Zero(runnerManager.usedRunners.Length())

        runnerManager.Load()
        err := runnerManager.load()
        s.NoError(err)

        _, ok := runnerManager.usedRunners.Get(tests.DefaultRunnerID)
        s.True(ok)

@@ -557,7 +577,8 @@ func (s *MainTestSuite) TestNomadRunnerManager_Load() {

        s.Require().Zero(runnerManager.usedRunners.Length())

        runnerManager.Load()
        err := runnerManager.load()
        s.NoError(err)

        s.Require().NotZero(runnerManager.usedRunners.Length())

@@ -41,9 +41,12 @@ var (
    ErrorUnknownExecution = errors.New("unknown execution")
    ErrorFileCopyFailed = errors.New("file copy failed")
    ErrFileNotFound = errors.New("file not found or insufficient permissions")
    ErrLocalDestruction DestroyReason = nomad.ErrorLocalDestruction
    ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
    ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
    ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
    ErrDestroyedAndReplaced DestroyReason = fmt.Errorf("the runner will be destroyed and replaced: %w", ErrLocalDestruction)
    ErrEnvironmentUpdated DestroyReason = errors.New("the environment will be destroyed and updated")
)

// NomadJob is an abstraction to communicate with Nomad environments.

@@ -258,10 +261,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
        }
    }

    // local determines if a reason is present that the runner should only be removed locally (without requesting Nomad).
    local := errors.Is(reason, nomad.ErrorAllocationRescheduled) ||
        errors.Is(reason, ErrOOMKilled)
    if local {
    if errors.Is(reason, ErrLocalDestruction) {
        log.WithContext(r.ctx).Debug("Runner destroyed locally")
        return nil
    }