Retry Environment Recovery
This commit is contained in:

committed by
Sebastian Serth

parent
89c18ad45c
commit
73759f8a3c
Submodule deploy/codeocean-terraform updated: 38ffc3e924...77e99a52e0
@ -245,7 +245,7 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
|
|||||||
r, ok := n.idleRunners.Sample()
|
r, ok := n.idleRunners.Sample()
|
||||||
if ok && n.idleRunners.Length() < n.PrewarmingPoolSize() {
|
if ok && n.idleRunners.Length() < n.PrewarmingPoolSize() {
|
||||||
go func() {
|
go func() {
|
||||||
err := util.RetryExponential(time.Second, func() error {
|
err := util.RetryExponential(func() error {
|
||||||
return n.createRunner(false)
|
return n.createRunner(false)
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -12,6 +12,7 @@ import (
|
|||||||
"github.com/openHPI/poseidon/pkg/logging"
|
"github.com/openHPI/poseidon/pkg/logging"
|
||||||
"github.com/openHPI/poseidon/pkg/monitoring"
|
"github.com/openHPI/poseidon/pkg/monitoring"
|
||||||
"github.com/openHPI/poseidon/pkg/storage"
|
"github.com/openHPI/poseidon/pkg/storage"
|
||||||
|
"github.com/openHPI/poseidon/pkg/util"
|
||||||
"os"
|
"os"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
@ -42,7 +43,7 @@ func NewNomadEnvironmentManager(
|
|||||||
|
|
||||||
m := &NomadEnvironmentManager{&AbstractManager{nil, runnerManager},
|
m := &NomadEnvironmentManager{&AbstractManager{nil, runnerManager},
|
||||||
apiClient, templateEnvironmentJobHCL}
|
apiClient, templateEnvironmentJobHCL}
|
||||||
if err := m.Load(); err != nil {
|
if err := util.RetryExponential(func() error { return m.Load() }); err != nil {
|
||||||
log.WithError(err).Error("Error recovering the execution environments")
|
log.WithError(err).Error("Error recovering the execution environments")
|
||||||
}
|
}
|
||||||
runnerManager.Load()
|
runnerManager.Load()
|
||||||
|
@ -55,7 +55,7 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
|
func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
|
||||||
err := util.RetryExponential(time.Second, func() (err error) {
|
err := util.RetryExponential(func() (err error) {
|
||||||
if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
|
if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
|
||||||
err = fmt.Errorf("cannot mark runner as used: %w", err)
|
err = fmt.Errorf("cannot mark runner as used: %w", err)
|
||||||
}
|
}
|
||||||
|
@ -244,7 +244,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if err == nil && !errors.Is(reason, ErrOOMKilled) {
|
if err == nil && !errors.Is(reason, ErrOOMKilled) {
|
||||||
err = util.RetryExponential(time.Second, func() (err error) {
|
err = util.RetryExponential(func() (err error) {
|
||||||
if err = r.api.DeleteJob(r.ID()); err != nil {
|
if err = r.api.DeleteJob(r.ID()); err != nil {
|
||||||
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
|
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
|
||||||
}
|
}
|
||||||
|
@ -9,6 +9,8 @@ var (
|
|||||||
log = logging.GetLogger("util")
|
log = logging.GetLogger("util")
|
||||||
// MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons.
|
// MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons.
|
||||||
MaxConnectionRetriesExponential = 18
|
MaxConnectionRetriesExponential = 18
|
||||||
|
// InitialWaitingDuration is the default initial duration to wait after a failed attempt.
|
||||||
|
InitialWaitingDuration = time.Second
|
||||||
)
|
)
|
||||||
|
|
||||||
// RetryExponentialAttempts executes the passed function
|
// RetryExponentialAttempts executes the passed function
|
||||||
@ -28,6 +30,10 @@ func RetryExponentialAttempts(attempts int, sleep time.Duration, f func() error)
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func RetryExponential(sleep time.Duration, f func() error) error {
|
func RetryExponentialDuration(sleep time.Duration, f func() error) error {
|
||||||
return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f)
|
return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func RetryExponential(f func() error) error {
|
||||||
|
return RetryExponentialDuration(InitialWaitingDuration, f)
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user