Remove usage of context.DeadlineExceeded

for internal decisions, as this error is widely used by other packages. Checking for it in wrapped errors can accidentally let external errors influence internal decisions.
In this case, the retry mechanism checked whether an error was context.DeadlineExceeded and assumed it had been created by its internal context. This assumption was wrong.
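
To illustrate the problem, here is a minimal sketch (not part of this commit): any error that wraps context.DeadlineExceeded satisfies errors.Is, regardless of which context produced it, so a retry mechanism matching on that error cannot tell its own expired context apart from a deadline error bubbling up from the retried operation. A dedicated sentinel such as the ErrRetryContextDone introduced in pkg/util below is only ever returned by the retry helpers themselves and is therefore unambiguous; the lowercase variable in the sketch is an illustrative stand-in.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// Illustrative stand-in for the sentinel this commit adds to pkg/util.
var errRetryContextDone = errors.New("the retry context is done")

func main() {
    // A deadline error produced by some unrelated context, e.g. inside an API call ...
    otherCtx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
    defer cancel()
    <-otherCtx.Done()
    err := fmt.Errorf("request failed: %w", otherCtx.Err())

    // ... still matches context.DeadlineExceeded, so this check says nothing about
    // whether the retry mechanism's own context has expired.
    fmt.Println(errors.Is(err, context.DeadlineExceeded)) // true

    // Only the retry helpers return the sentinel, so matching on it stays unambiguous.
    fmt.Println(errors.Is(err, errRetryContextDone)) // false
}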
Author: Maximilian Paß
Date: 2023-10-30 14:51:31 +01:00
Committed by: Sebastian Serth
Parent: 6b69a2d732
Commit: d0dd5c08cb
6 changed files with 22 additions and 12 deletions

View File

@@ -95,7 +95,7 @@ func (cw *codeOceanOutputWriter) StdErr() io.Writer {
 // Close forwards the kind of exit (timeout, error, normal) to CodeOcean.
 // This results in the closing of the WebSocket connection.
-// The call of Close is mandantory!
+// The call of Close is mandatory!
 func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) {
     defer func() { cw.queue <- &writingLoopMessage{done: true} }()
     // Mask the internal stop reason before disclosing/forwarding it externally/to CodeOcean.
@@ -104,7 +104,7 @@ func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) {
         return
     case info.Err == nil:
         cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
-    case errors.Is(info.Err, context.DeadlineExceeded) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout):
+    case errors.Is(info.Err, runner.ErrorExecutionTimeout) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout):
         cw.send(&dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout})
     case errors.Is(info.Err, runner.ErrOOMKilled):
         cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: dto.ErrOOMKilled.Error()})

View File

@@ -3,7 +3,6 @@ package environment
 import (
     "context"
     _ "embed"
-    "errors"
     "fmt"
     nomadApi "github.com/hashicorp/nomad/api"
     "github.com/hashicorp/nomad/nomad/structs"
@@ -14,6 +13,7 @@ import (
     "github.com/openHPI/poseidon/pkg/monitoring"
     "github.com/openHPI/poseidon/pkg/storage"
     "github.com/openHPI/poseidon/pkg/util"
+    "github.com/sirupsen/logrus"
     "math"
     "os"
     "time"
@@ -187,14 +187,17 @@ func (m *NomadEnvironmentManager) KeepEnvironmentsSynced(synchronizeRunners func
         return nil
     })
-    if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
-        log.WithContext(ctx).WithError(err).Fatal("Stopped KeepEnvironmentsSynced")
+    level := logrus.InfoLevel
+    if err != nil && ctx.Err() == nil {
+        level = logrus.FatalLevel
     }
+    log.WithContext(ctx).WithError(err).Log(level, "Stopped KeepEnvironmentsSynced")
 }

 // Load recovers all environments from the Jobs in Nomad.
 // As it replaces the environments the idle runners stored are not tracked anymore.
 func (m *NomadEnvironmentManager) load() error {
+    log.Info("Loading environments")
     // We have to destroy the environments first as otherwise they would just be replaced and old goroutines might stay running.
     for _, environment := range m.runnerManager.ListEnvironments() {
         err := environment.Delete(runner.ErrDestroyedAndReplaced)

View File

@@ -31,7 +31,10 @@ const (
     TimerExpired TimerState = 2
 )

-var ErrorRunnerInactivityTimeout DestroyReason = errors.New("runner inactivity timeout exceeded")
+var (
+    ErrorRunnerInactivityTimeout DestroyReason = errors.New("runner inactivity timeout exceeded")
+    ErrorExecutionTimeout = errors.New("execution timeout exceeded")
+)

 type InactivityTimerImplementation struct {
     timer *time.Timer

View File

@@ -79,16 +79,17 @@ func (m *NomadRunnerManager) Return(r Runner) error {
 // SynchronizeRunners loads all runners and keeps them synchronized (without a retry mechanism).
 func (m *NomadRunnerManager) SynchronizeRunners(ctx context.Context) error {
-    // Load Runners
+    log.Info("Loading runners")
     if err := m.load(); err != nil {
         return fmt.Errorf("failed loading runners: %w", err)
     }

     // Watch for changes regarding the existing or new runners.
+    log.Info("Watching Event Stream")
     err := m.apiClient.WatchEventStream(ctx,
         &nomad.AllocationProcessing{OnNew: m.onAllocationAdded, OnDeleted: m.onAllocationStopped})
-    if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
+    if err != nil && ctx.Err() == nil {
         err = fmt.Errorf("nomad Event Stream failed!: %w", err)
     }
     return err
@@ -97,7 +98,6 @@ func (m *NomadRunnerManager) SynchronizeRunners(ctx context.Context) error {
 // Load recovers all runners for all existing environments.
 func (m *NomadRunnerManager) load() error {
     newUsedRunners := storage.NewLocalStorage[Runner]()
     for _, environment := range m.environments.List() {
         environmentLogger := log.WithField(dto.KeyEnvironmentID, environment.ID().ToString())

View File

@@ -342,6 +342,9 @@ func (r *NomadJob) handleExit(exitInfo ExitInfo, exitInternal <-chan ExitInfo, e
 func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- ExitInfo,
     stdin io.ReadWriter, ctx context.Context) {
     err := ctx.Err()
+    if errors.Is(err, context.DeadlineExceeded) {
+        err = ErrorExecutionTimeout
+    } // for errors.Is(err, context.Canceled) the user likely disconnected from the execution.
     if reason, ok := r.ctx.Value(destroyReasonContextKey).(error); ok {
         err = reason
         if r.TimeoutPassed() && !errors.Is(err, ErrorRunnerInactivityTimeout) {

View File

@@ -3,7 +3,6 @@ package util
 import (
     "context"
     "errors"
-    "fmt"
     "github.com/openHPI/poseidon/pkg/logging"
     "time"
 )
@@ -14,6 +13,7 @@ var (
     MaxConnectionRetriesExponential = 18
     // InitialWaitingDuration is the default initial duration of waiting after a failed time.
     InitialWaitingDuration = time.Second
+    ErrRetryContextDone = errors.New("the retry context is done")
 )

 func retryExponential(ctx context.Context, sleep time.Duration, f func() error) func() error {
@@ -23,6 +23,7 @@ func retryExponential(ctx context.Context, sleep time.Duration, f func() error)
         if err != nil {
             select {
             case <-ctx.Done():
+                err = ErrRetryContextDone
             case <-time.After(sleep):
                 sleep *= 2
             }
@@ -39,7 +40,7 @@ func retryConstant(ctx context.Context, sleep time.Duration, f func() error) fun
         if err != nil {
             select {
             case <-ctx.Done():
-                return fmt.Errorf("stopped retrying: %w", ctx.Err())
+                return ErrRetryContextDone
             case <-time.After(sleep):
             }
         }
@@ -53,7 +54,7 @@ func retryAttempts(maxAttempts int, f func() error) (err error) {
         err = f()
         if err == nil {
             return nil
-        } else if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
+        } else if errors.Is(err, ErrRetryContextDone) {
             return err
         }
         log.WithField("count", i).WithError(err).Debug("retrying after error")