Add retry mechanism for sample, mark-as-used, and return of Nomad runners.
This commit is contained in:
Maximilian Paß
2022-10-14 21:29:23 +01:00
parent b9c923da8a
commit 160df3d9e6
4 changed files with 56 additions and 9 deletions

View File

@ -13,14 +13,13 @@ import (
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/storage"
"github.com/openHPI/poseidon/pkg/util"
"strconv"
"sync"
"time"
)
const (
portNumberBase = 10
)
const portNumberBase = 10
var ErrScaleDown = errors.New("cannot scale down the environment")
@ -246,7 +245,9 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
r, ok := n.idleRunners.Sample()
if ok {
go func() {
err := n.createRunner(false)
err := util.RetryExponential(time.Second, func() error {
return n.createRunner(false)
})
if err != nil {
log.WithError(err).WithField("environmentID", n.ID()).Error("Couldn't create new runner for claimed one")
}

View File

@ -10,6 +10,7 @@ import (
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/logging"
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/util"
"github.com/sirupsen/logrus"
"strconv"
"time"
@ -54,9 +55,14 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int
}
func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
err := m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration)
err := util.RetryExponential(time.Second, func() (err error) {
if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
err = fmt.Errorf("cannot mark runner as used: %w", err)
}
return
})
if err != nil {
err = m.Return(runner)
err := m.Return(runner)
if err != nil {
log.WithError(err).WithField("runnerID", runner.ID()).Error("can't mark runner as used and can't return runner")
}
@ -65,9 +71,14 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int
func (m *NomadRunnerManager) Return(r Runner) error {
r.StopTimeout()
err := m.apiClient.DeleteJob(r.ID())
err := util.RetryExponential(time.Second, func() (err error) {
if err = m.apiClient.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
}
return
})
if err != nil {
return fmt.Errorf("error deleting runner in Nomad: %w", err)
return fmt.Errorf("%w", err)
}
m.usedRunners.Delete(r.ID())
return nil

View File

@ -6,6 +6,7 @@ import (
"github.com/openHPI/poseidon/internal/nomad"
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/storage"
"github.com/openHPI/poseidon/pkg/util"
"github.com/openHPI/poseidon/tests"
"github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
@ -146,6 +147,7 @@ func (s *ManagerTestSuite) TestClaimAddsRunnerToUsedRunners() {
func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
s.exerciseEnvironment.On("Sample", mock.Anything).Return(s.exerciseRunner, true)
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
util.MaxConnectionRetriesExponential = 1
modifyMockedCall(s.apiMock, "MarkRunnerAsUsed", func(call *mock.Call) {
call.Run(func(args mock.Arguments) {
call.ReturnArguments = mock.Arguments{tests.ErrDefault}
@ -154,7 +156,7 @@ func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
claimedRunner, err := s.nomadRunnerManager.Claim(defaultEnvironmentID, defaultInactivityTimeout)
s.Require().NoError(err)
<-time.After(tests.ShortTimeout) // Claimed runners are marked as used asynchronously
<-time.After(time.Second + tests.ShortTimeout) // Claimed runners are marked as used asynchronously
s.apiMock.AssertCalled(s.T(), "DeleteJob", claimedRunner.ID())
_, ok := s.nomadRunnerManager.usedRunners.Get(claimedRunner.ID())
s.False(ok)

33
pkg/util/util.go Normal file
View File

@ -0,0 +1,33 @@
package util
import (
"github.com/openHPI/poseidon/pkg/logging"
"time"
)
var (
	// log is the package-level logger used to report retry attempts.
	log = logging.GetLogger("util")
	// MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons.
	MaxConnectionRetriesExponential = 18
)
// RetryExponentialAttempts executes the passed function
// with exponentially increasing time in between starting at the passed sleep duration
// up to a maximum of attempts tries.
// RetryExponentialAttempts executes the passed function
// with exponentially increasing time in between starting at the passed sleep duration
// up to a maximum of attempts tries. It returns nil as soon as f succeeds,
// otherwise the error of the last attempt.
func RetryExponentialAttempts(attempts int, sleep time.Duration, f func() error) (err error) {
	for i := 0; i < attempts; i++ {
		err = f()
		if err == nil {
			return nil
		}
		log.WithField("count", i).WithError(err).Debug("retrying after error")
		// Do not sleep after the final attempt: the caller should receive
		// the error immediately instead of waiting one more backoff interval.
		if i < attempts-1 {
			time.Sleep(sleep)
			sleep *= 2
		}
	}
	return err
}
func RetryExponential(sleep time.Duration, f func() error) error {
return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f)
}