Extract the WatchEventStream retry mechanism

into the utils, which include all other retry mechanisms.

This change also fixes the WatchEventStream goroutine not stopping immediately when the context is done; previously, it stopped only one second later.
This commit is contained in:
Maximilian Paß
2023-09-05 16:10:48 +02:00
parent 0d6b4f660c
commit e3161637a9
4 changed files with 67 additions and 28 deletions

View File

@ -245,7 +245,7 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
r, ok := n.idleRunners.Sample()
if ok && n.idleRunners.Length() < n.PrewarmingPoolSize() {
go func() {
err := util.RetryExponentialContext(n.ctx, func() error { return n.createRunner(false) })
err := util.RetryExponentialWithContext(n.ctx, func() error { return n.createRunner(false) })
if err != nil {
log.WithError(err).WithField(dto.KeyEnvironmentID, n.ID().ToString()).
Error("Couldn't create new runner for claimed one")

View File

@ -44,7 +44,7 @@ func NewNomadEnvironmentManager(
m := &NomadEnvironmentManager{&AbstractManager{nil, runnerManager},
apiClient, templateEnvironmentJobHCL}
if err := util.RetryExponentialContext(ctx, func() error { return m.Load() }); err != nil {
if err := util.RetryExponentialWithContext(ctx, func() error { return m.Load() }); err != nil {
log.WithError(err).Error("Error recovering the execution environments")
}
runnerManager.Load()

View File

@ -12,6 +12,7 @@ import (
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/util"
"github.com/sirupsen/logrus"
"math"
"strconv"
"time"
)
@ -125,13 +126,17 @@ func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger
}
func (m *NomadRunnerManager) keepRunnersSynced(ctx context.Context) {
retries := 0
for ctx.Err() == nil {
err := util.RetryConstantAttemptsWithContext(math.MaxInt, ctx, func() error {
err := m.apiClient.WatchEventStream(ctx,
&nomad.AllocationProcessing{OnNew: m.onAllocationAdded, OnDeleted: m.onAllocationStopped})
retries += 1
log.WithContext(ctx).WithError(err).WithField("count", retries).Errorf("Nomad Event Stream failed! Retrying...")
<-time.After(time.Second)
if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
log.WithContext(ctx).WithError(err).Errorf("Nomad Event Stream failed! Retrying...")
err = fmt.Errorf("KeepRunnersSynced: %w", err)
}
return err
})
if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
log.WithContext(ctx).WithError(err).Fatal("Stopped Restarting the Nomad Event Stream")
}
}