From 73759f8a3cec9f32c20723c4553ec860ebab8f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Pa=C3=9F?= <22845248+mpass99@users.noreply.github.com> Date: Wed, 16 Aug 2023 12:47:32 +0200 Subject: [PATCH] Retry Environment Recovery --- deploy/codeocean-terraform | 2 +- internal/environment/nomad_environment.go | 2 +- internal/environment/nomad_manager.go | 3 ++- internal/runner/nomad_manager.go | 2 +- internal/runner/nomad_runner.go | 2 +- pkg/util/util.go | 8 +++++++- 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/deploy/codeocean-terraform b/deploy/codeocean-terraform index 38ffc3e..77e99a5 160000 --- a/deploy/codeocean-terraform +++ b/deploy/codeocean-terraform @@ -1 +1 @@ -Subproject commit 38ffc3e9247abddf55191bf951569f6e79cd433d +Subproject commit 77e99a52e04a33be897d058b265080d7bbd5ea42 diff --git a/internal/environment/nomad_environment.go b/internal/environment/nomad_environment.go index 4da8b9b..0d40781 100644 --- a/internal/environment/nomad_environment.go +++ b/internal/environment/nomad_environment.go @@ -245,7 +245,7 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) { r, ok := n.idleRunners.Sample() if ok && n.idleRunners.Length() < n.PrewarmingPoolSize() { go func() { - err := util.RetryExponential(time.Second, func() error { + err := util.RetryExponential(func() error { return n.createRunner(false) }) if err != nil { diff --git a/internal/environment/nomad_manager.go b/internal/environment/nomad_manager.go index 7dccad9..7874e68 100644 --- a/internal/environment/nomad_manager.go +++ b/internal/environment/nomad_manager.go @@ -12,6 +12,7 @@ import ( "github.com/openHPI/poseidon/pkg/logging" "github.com/openHPI/poseidon/pkg/monitoring" "github.com/openHPI/poseidon/pkg/storage" + "github.com/openHPI/poseidon/pkg/util" "os" "time" ) @@ -42,7 +43,7 @@ func NewNomadEnvironmentManager( m := &NomadEnvironmentManager{&AbstractManager{nil, runnerManager}, apiClient, templateEnvironmentJobHCL} - if err := m.Load(); err != nil { 
+ if err := util.RetryExponential(func() error { return m.Load() }); err != nil { log.WithError(err).Error("Error recovering the execution environments") } runnerManager.Load() diff --git a/internal/runner/nomad_manager.go b/internal/runner/nomad_manager.go index 4deda3f..e6ede1c 100644 --- a/internal/runner/nomad_manager.go +++ b/internal/runner/nomad_manager.go @@ -55,7 +55,7 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int } func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) { - err := util.RetryExponential(time.Second, func() (err error) { + err := util.RetryExponential(func() (err error) { if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil { err = fmt.Errorf("cannot mark runner as used: %w", err) } diff --git a/internal/runner/nomad_runner.go b/internal/runner/nomad_runner.go index e5d0eb6..5573693 100644 --- a/internal/runner/nomad_runner.go +++ b/internal/runner/nomad_runner.go @@ -244,7 +244,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) { } if err == nil && !errors.Is(reason, ErrOOMKilled) { - err = util.RetryExponential(time.Second, func() (err error) { + err = util.RetryExponential(func() (err error) { if err = r.api.DeleteJob(r.ID()); err != nil { err = fmt.Errorf("error deleting runner in Nomad: %w", err) } diff --git a/pkg/util/util.go b/pkg/util/util.go index 08e57e5..a27ed7b 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -9,6 +9,8 @@ var ( log = logging.GetLogger("util") // MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons. MaxConnectionRetriesExponential = 18 // InitialWaitingDuration is the default initial duration of waiting after a failed attempt. 
+ InitialWaitingDuration = time.Second ) // RetryExponentialAttempts executes the passed function @@ -28,6 +30,10 @@ func RetryExponentialAttempts(attempts int, sleep time.Duration, f func() error) return err } -func RetryExponential(sleep time.Duration, f func() error) error { +func RetryExponentialDuration(sleep time.Duration, f func() error) error { return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f) } + +func RetryExponential(f func() error) error { + return RetryExponentialDuration(InitialWaitingDuration, f) +}