Add a retry mechanism for sampling, marking as used, and returning
Nomad runners.
This commit is contained in:
@ -13,14 +13,13 @@ import (
|
|||||||
"github.com/openHPI/poseidon/pkg/dto"
|
"github.com/openHPI/poseidon/pkg/dto"
|
||||||
"github.com/openHPI/poseidon/pkg/monitoring"
|
"github.com/openHPI/poseidon/pkg/monitoring"
|
||||||
"github.com/openHPI/poseidon/pkg/storage"
|
"github.com/openHPI/poseidon/pkg/storage"
|
||||||
|
"github.com/openHPI/poseidon/pkg/util"
|
||||||
"strconv"
|
"strconv"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const portNumberBase = 10
|
||||||
portNumberBase = 10
|
|
||||||
)
|
|
||||||
|
|
||||||
var ErrScaleDown = errors.New("cannot scale down the environment")
|
var ErrScaleDown = errors.New("cannot scale down the environment")
|
||||||
|
|
||||||
@ -246,7 +245,9 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
|
|||||||
r, ok := n.idleRunners.Sample()
|
r, ok := n.idleRunners.Sample()
|
||||||
if ok {
|
if ok {
|
||||||
go func() {
|
go func() {
|
||||||
err := n.createRunner(false)
|
err := util.RetryExponential(time.Second, func() error {
|
||||||
|
return n.createRunner(false)
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.WithError(err).WithField("environmentID", n.ID()).Error("Couldn't create new runner for claimed one")
|
log.WithError(err).WithField("environmentID", n.ID()).Error("Couldn't create new runner for claimed one")
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,7 @@ import (
|
|||||||
"github.com/openHPI/poseidon/pkg/dto"
|
"github.com/openHPI/poseidon/pkg/dto"
|
||||||
"github.com/openHPI/poseidon/pkg/logging"
|
"github.com/openHPI/poseidon/pkg/logging"
|
||||||
"github.com/openHPI/poseidon/pkg/monitoring"
|
"github.com/openHPI/poseidon/pkg/monitoring"
|
||||||
|
"github.com/openHPI/poseidon/pkg/util"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
@ -54,9 +55,14 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
|
func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
|
||||||
err := m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration)
|
err := util.RetryExponential(time.Second, func() (err error) {
|
||||||
|
if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
|
||||||
|
err = fmt.Errorf("cannot mark runner as used: %w", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err = m.Return(runner)
|
err := m.Return(runner)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.WithError(err).WithField("runnerID", runner.ID()).Error("can't mark runner as used and can't return runner")
|
log.WithError(err).WithField("runnerID", runner.ID()).Error("can't mark runner as used and can't return runner")
|
||||||
}
|
}
|
||||||
@ -65,9 +71,14 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int
|
|||||||
|
|
||||||
func (m *NomadRunnerManager) Return(r Runner) error {
|
func (m *NomadRunnerManager) Return(r Runner) error {
|
||||||
r.StopTimeout()
|
r.StopTimeout()
|
||||||
err := m.apiClient.DeleteJob(r.ID())
|
err := util.RetryExponential(time.Second, func() (err error) {
|
||||||
|
if err = m.apiClient.DeleteJob(r.ID()); err != nil {
|
||||||
|
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("error deleting runner in Nomad: %w", err)
|
return fmt.Errorf("%w", err)
|
||||||
}
|
}
|
||||||
m.usedRunners.Delete(r.ID())
|
m.usedRunners.Delete(r.ID())
|
||||||
return nil
|
return nil
|
||||||
|
@ -6,6 +6,7 @@ import (
|
|||||||
"github.com/openHPI/poseidon/internal/nomad"
|
"github.com/openHPI/poseidon/internal/nomad"
|
||||||
"github.com/openHPI/poseidon/pkg/dto"
|
"github.com/openHPI/poseidon/pkg/dto"
|
||||||
"github.com/openHPI/poseidon/pkg/storage"
|
"github.com/openHPI/poseidon/pkg/storage"
|
||||||
|
"github.com/openHPI/poseidon/pkg/util"
|
||||||
"github.com/openHPI/poseidon/tests"
|
"github.com/openHPI/poseidon/tests"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"github.com/sirupsen/logrus/hooks/test"
|
"github.com/sirupsen/logrus/hooks/test"
|
||||||
@ -146,6 +147,7 @@ func (s *ManagerTestSuite) TestClaimAddsRunnerToUsedRunners() {
|
|||||||
func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
|
func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
|
||||||
s.exerciseEnvironment.On("Sample", mock.Anything).Return(s.exerciseRunner, true)
|
s.exerciseEnvironment.On("Sample", mock.Anything).Return(s.exerciseRunner, true)
|
||||||
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
|
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
|
||||||
|
util.MaxConnectionRetriesExponential = 1
|
||||||
modifyMockedCall(s.apiMock, "MarkRunnerAsUsed", func(call *mock.Call) {
|
modifyMockedCall(s.apiMock, "MarkRunnerAsUsed", func(call *mock.Call) {
|
||||||
call.Run(func(args mock.Arguments) {
|
call.Run(func(args mock.Arguments) {
|
||||||
call.ReturnArguments = mock.Arguments{tests.ErrDefault}
|
call.ReturnArguments = mock.Arguments{tests.ErrDefault}
|
||||||
@ -154,7 +156,7 @@ func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
|
|||||||
|
|
||||||
claimedRunner, err := s.nomadRunnerManager.Claim(defaultEnvironmentID, defaultInactivityTimeout)
|
claimedRunner, err := s.nomadRunnerManager.Claim(defaultEnvironmentID, defaultInactivityTimeout)
|
||||||
s.Require().NoError(err)
|
s.Require().NoError(err)
|
||||||
<-time.After(tests.ShortTimeout) // Claimed runners are marked as used asynchronously
|
<-time.After(time.Second + tests.ShortTimeout) // Claimed runners are marked as used asynchronously
|
||||||
s.apiMock.AssertCalled(s.T(), "DeleteJob", claimedRunner.ID())
|
s.apiMock.AssertCalled(s.T(), "DeleteJob", claimedRunner.ID())
|
||||||
_, ok := s.nomadRunnerManager.usedRunners.Get(claimedRunner.ID())
|
_, ok := s.nomadRunnerManager.usedRunners.Get(claimedRunner.ID())
|
||||||
s.False(ok)
|
s.False(ok)
|
||||||
|
33
pkg/util/util.go
Normal file
33
pkg/util/util.go
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
package util
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/openHPI/poseidon/pkg/logging"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
log = logging.GetLogger("util")
|
||||||
|
// MaxConnectionRetriesExponential is the default number of retries. It's exported for testing reasons.
|
||||||
|
MaxConnectionRetriesExponential = 18
|
||||||
|
)
|
||||||
|
|
||||||
|
// RetryExponentialAttempts executes the passed function
|
||||||
|
// with exponentially increasing time in between starting at the passed sleep duration
|
||||||
|
// up to a maximum of attempts tries.
|
||||||
|
func RetryExponentialAttempts(attempts int, sleep time.Duration, f func() error) (err error) {
|
||||||
|
for i := 0; i < attempts; i++ {
|
||||||
|
err = f()
|
||||||
|
if err == nil {
|
||||||
|
return
|
||||||
|
} else {
|
||||||
|
log.WithField("count", i).WithError(err).Debug("retrying after error")
|
||||||
|
time.Sleep(sleep)
|
||||||
|
sleep *= 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func RetryExponential(sleep time.Duration, f func() error) error {
|
||||||
|
return RetryExponentialAttempts(MaxConnectionRetriesExponential, sleep, f)
|
||||||
|
}
|
Reference in New Issue
Block a user