diff --git a/internal/environment/nomad_environment.go b/internal/environment/nomad_environment.go index ed3d7b3..c27b232 100644 --- a/internal/environment/nomad_environment.go +++ b/internal/environment/nomad_environment.go @@ -13,14 +13,13 @@ import ( "github.com/openHPI/poseidon/pkg/dto" "github.com/openHPI/poseidon/pkg/monitoring" "github.com/openHPI/poseidon/pkg/storage" + "github.com/openHPI/poseidon/pkg/util" "strconv" "sync" "time" ) -const ( - portNumberBase = 10 -) +const portNumberBase = 10 var ErrScaleDown = errors.New("cannot scale down the environment") @@ -246,7 +245,9 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) { r, ok := n.idleRunners.Sample() if ok { go func() { - err := n.createRunner(false) + err := util.RetryExponential(time.Second, func() error { + return n.createRunner(false) + }) if err != nil { log.WithError(err).WithField("environmentID", n.ID()).Error("Couldn't create new runner for claimed one") } diff --git a/internal/runner/nomad_manager.go b/internal/runner/nomad_manager.go index 7ffc23f..acdd83a 100644 --- a/internal/runner/nomad_manager.go +++ b/internal/runner/nomad_manager.go @@ -10,6 +10,7 @@ import ( "github.com/openHPI/poseidon/pkg/dto" "github.com/openHPI/poseidon/pkg/logging" "github.com/openHPI/poseidon/pkg/monitoring" + "github.com/openHPI/poseidon/pkg/util" "github.com/sirupsen/logrus" "strconv" "time" @@ -54,9 +55,14 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int } func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) { - err := m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration) + err := util.RetryExponential(time.Second, func() (err error) { + if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil { + err = fmt.Errorf("cannot mark runner as used: %w", err) + } + return + }) if err != nil { - err = m.Return(runner) + err := m.Return(runner) if err != nil { log.WithError(err).WithField("runnerID", runner.ID()).Error("can't mark runner as used and can't return runner") } @@ -65,9 +71,14 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int func (m *NomadRunnerManager) Return(r Runner) error { r.StopTimeout() - err := m.apiClient.DeleteJob(r.ID()) + err := util.RetryExponential(time.Second, func() (err error) { + if err = m.apiClient.DeleteJob(r.ID()); err != nil { + err = fmt.Errorf("error deleting runner in Nomad: %w", err) + } + return + }) if err != nil { - return fmt.Errorf("error deleting runner in Nomad: %w", err) + return fmt.Errorf("%w", err) } m.usedRunners.Delete(r.ID()) return nil diff --git a/internal/runner/nomad_manager_test.go b/internal/runner/nomad_manager_test.go index 49158e0..4fe3c63 100644 --- a/internal/runner/nomad_manager_test.go +++ b/internal/runner/nomad_manager_test.go @@ -6,6 +6,7 @@ import ( "github.com/openHPI/poseidon/internal/nomad" "github.com/openHPI/poseidon/pkg/dto" "github.com/openHPI/poseidon/pkg/storage" + "github.com/openHPI/poseidon/pkg/util" "github.com/openHPI/poseidon/tests" "github.com/sirupsen/logrus" "github.com/sirupsen/logrus/hooks/test" @@ -146,6 +147,7 @@ func (s *ManagerTestSuite) TestClaimAddsRunnerToUsedRunners() { func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() { s.exerciseEnvironment.On("Sample", mock.Anything).Return(s.exerciseRunner, true) s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil) + util.MaxConnectionRetriesExponential = 1 modifyMockedCall(s.apiMock, "MarkRunnerAsUsed", func(call *mock.Call) { call.Run(func(args mock.Arguments) { call.ReturnArguments = mock.Arguments{tests.ErrDefault} @@ -154,7 +156,7 @@ func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() { claimedRunner, err := s.nomadRunnerManager.Claim(defaultEnvironmentID, defaultInactivityTimeout) s.Require().NoError(err) - <-time.After(tests.ShortTimeout) // Claimed runners are marked as used asynchronously + <-time.After(time.Second + tests.ShortTimeout) // Claimed runners are marked as used asynchronously s.apiMock.AssertCalled(s.T(), "DeleteJob", claimedRunner.ID()) _, ok := s.nomadRunnerManager.usedRunners.Get(claimedRunner.ID()) s.False(ok) diff --git a/pkg/util/util.go b/pkg/util/util.go new file mode 100644 index 0000000..ed3f736 --- /dev/null +++ b/pkg/util/util.go @@ -0,0 +1,33 @@ +package util + +import ( + "github.com/openHPI/poseidon/pkg/logging" + "time" +) + +var ( + log = logging.GetLogger("util") + // MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons. + MaxConnectionRetriesExponential = 18 +) + +// RetryExponentialAttempts executes the passed function +// with exponentially increasing time in between starting at the passed sleep duration +// up to a maximum of attempts tries. +func RetryExponentialAttempts(attempts int, sleep time.Duration, f func() error) (err error) { + for i := 0; i < attempts; i++ { + err = f() + if err == nil { + return + } else { + log.WithField("count", i).WithError(err).Debug("retrying after error") + time.Sleep(sleep) + sleep *= 2 + } + } + return err +} + +func RetryExponential(sleep time.Duration, f func() error) error { + return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f) +}