Retry Environment Recovery

This commit is contained in:
Maximilian Paß
2023-08-16 12:47:32 +02:00
committed by Sebastian Serth
parent 89c18ad45c
commit 73759f8a3c
6 changed files with 13 additions and 6 deletions

View File

@@ -245,7 +245,7 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
 	r, ok := n.idleRunners.Sample()
 	if ok && n.idleRunners.Length() < n.PrewarmingPoolSize() {
 		go func() {
-			err := util.RetryExponential(time.Second, func() error {
+			err := util.RetryExponential(func() error {
 				return n.createRunner(false)
 			})
 			if err != nil {

View File

@@ -12,6 +12,7 @@ import (
 	"github.com/openHPI/poseidon/pkg/logging"
 	"github.com/openHPI/poseidon/pkg/monitoring"
 	"github.com/openHPI/poseidon/pkg/storage"
+	"github.com/openHPI/poseidon/pkg/util"
 	"os"
 	"time"
 )
@@ -42,7 +43,7 @@ func NewNomadEnvironmentManager(
 	m := &NomadEnvironmentManager{&AbstractManager{nil, runnerManager},
 		apiClient, templateEnvironmentJobHCL}
-	if err := m.Load(); err != nil {
+	if err := util.RetryExponential(func() error { return m.Load() }); err != nil {
 		log.WithError(err).Error("Error recovering the execution environments")
 	}
 	runnerManager.Load()

View File

@@ -55,7 +55,7 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int
 	}
 }

 func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
-	err := util.RetryExponential(time.Second, func() (err error) {
+	err := util.RetryExponential(func() (err error) {
 		if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
 			err = fmt.Errorf("cannot mark runner as used: %w", err)
 		}

View File

@@ -244,7 +244,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
 	}
 	if err == nil && !errors.Is(reason, ErrOOMKilled) {
-		err = util.RetryExponential(time.Second, func() (err error) {
+		err = util.RetryExponential(func() (err error) {
 			if err = r.api.DeleteJob(r.ID()); err != nil {
 				err = fmt.Errorf("error deleting runner in Nomad: %w", err)
 			}

View File

@@ -9,6 +9,8 @@ var (
 	log = logging.GetLogger("util")
 	// MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons.
 	MaxConnectionRetriesExponential = 18
+	// InitialWaitingDuration is the default initial duration of waiting after a failed time.
+	InitialWaitingDuration = time.Second
 )

 // RetryExponentialAttempts executes the passed function
@@ -28,6 +30,10 @@ func RetryExponentialAttempts(attempts int, sleep time.Duration, f func() error)
 	return err
 }

-func RetryExponential(sleep time.Duration, f func() error) error {
+func RetryExponentialDuration(sleep time.Duration, f func() error) error {
 	return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f)
 }
+
+func RetryExponential(f func() error) error {
+	return RetryExponentialDuration(InitialWaitingDuration, f)
+}