Add retry mechanism for sample, mark-as-used, and return of Nomad runners.
This commit is contained in:
Maximilian Paß
2022-10-14 21:29:23 +01:00
parent b9c923da8a
commit 160df3d9e6
4 changed files with 56 additions and 9 deletions

View File

@ -13,14 +13,13 @@ import (
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/storage"
"github.com/openHPI/poseidon/pkg/util"
"strconv"
"sync"
"time"
)
const (
portNumberBase = 10
)
const portNumberBase = 10
var ErrScaleDown = errors.New("cannot scale down the environment")
@ -246,7 +245,9 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
r, ok := n.idleRunners.Sample()
if ok {
go func() {
err := n.createRunner(false)
err := util.RetryExponential(time.Second, func() error {
return n.createRunner(false)
})
if err != nil {
log.WithError(err).WithField("environmentID", n.ID()).Error("Couldn't create new runner for claimed one")
}

View File

@ -10,6 +10,7 @@ import (
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/logging"
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/util"
"github.com/sirupsen/logrus"
"strconv"
"time"
@ -54,9 +55,14 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int
}
func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
err := m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration)
err := util.RetryExponential(time.Second, func() (err error) {
if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
err = fmt.Errorf("cannot mark runner as used: %w", err)
}
return
})
if err != nil {
err = m.Return(runner)
err := m.Return(runner)
if err != nil {
log.WithError(err).WithField("runnerID", runner.ID()).Error("can't mark runner as used and can't return runner")
}
@ -65,9 +71,14 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int
func (m *NomadRunnerManager) Return(r Runner) error {
r.StopTimeout()
err := m.apiClient.DeleteJob(r.ID())
err := util.RetryExponential(time.Second, func() (err error) {
if err = m.apiClient.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
}
return
})
if err != nil {
return fmt.Errorf("error deleting runner in Nomad: %w", err)
return fmt.Errorf("%w", err)
}
m.usedRunners.Delete(r.ID())
return nil

View File

@ -6,6 +6,7 @@ import (
"github.com/openHPI/poseidon/internal/nomad"
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/storage"
"github.com/openHPI/poseidon/pkg/util"
"github.com/openHPI/poseidon/tests"
"github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
@ -146,6 +147,7 @@ func (s *ManagerTestSuite) TestClaimAddsRunnerToUsedRunners() {
func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
s.exerciseEnvironment.On("Sample", mock.Anything).Return(s.exerciseRunner, true)
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
util.MaxConnectionRetriesExponential = 1
modifyMockedCall(s.apiMock, "MarkRunnerAsUsed", func(call *mock.Call) {
call.Run(func(args mock.Arguments) {
call.ReturnArguments = mock.Arguments{tests.ErrDefault}
@ -154,7 +156,7 @@ func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
claimedRunner, err := s.nomadRunnerManager.Claim(defaultEnvironmentID, defaultInactivityTimeout)
s.Require().NoError(err)
<-time.After(tests.ShortTimeout) // Claimed runners are marked as used asynchronously
<-time.After(time.Second + tests.ShortTimeout) // Claimed runners are marked as used asynchronously
s.apiMock.AssertCalled(s.T(), "DeleteJob", claimedRunner.ID())
_, ok := s.nomadRunnerManager.usedRunners.Get(claimedRunner.ID())
s.False(ok)

33
pkg/util/util.go Normal file
View File

@ -0,0 +1,33 @@
package util
import (
"github.com/openHPI/poseidon/pkg/logging"
"time"
)
var (
	// log is the package-level logger used to report retry attempts.
	log = logging.GetLogger("util")
	// MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons.
	MaxConnectionRetriesExponential = 18
)
// RetryExponentialAttempts executes the passed function
// with exponentially increasing time in between starting at the passed sleep duration
// up to a maximum of attempts tries.
// RetryExponentialAttempts executes the passed function
// with exponentially increasing time in between starting at the passed sleep duration
// up to a maximum of attempts tries. It returns nil as soon as f succeeds,
// otherwise the error of the last attempt.
func RetryExponentialAttempts(attempts int, sleep time.Duration, f func() error) (err error) {
	for i := 0; i < attempts; i++ {
		err = f()
		if err == nil {
			return nil
		}
		log.WithField("count", i).WithError(err).Debug("retrying after error")
		// Do not sleep after the final attempt: the caller should receive
		// the error immediately instead of waiting one more backoff interval.
		if i < attempts-1 {
			time.Sleep(sleep)
			sleep *= 2
		}
	}
	return err
}
func RetryExponential(sleep time.Duration, f func() error) error {
return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f)
}