Refactor Runner Destroy Reason Masking

and ignore expected reasons, such as when the runner is destroyed by an API request.
This commit is contained in:
Maximilian Paß
2023-07-21 17:42:49 +01:00
parent 102b3f0701
commit eb818f92f7
3 changed files with 11 additions and 15 deletions

View File

@ -5,6 +5,7 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"github.com/gorilla/websocket" "github.com/gorilla/websocket"
"github.com/openHPI/poseidon/internal/nomad"
"github.com/openHPI/poseidon/internal/runner" "github.com/openHPI/poseidon/internal/runner"
"github.com/openHPI/poseidon/pkg/dto" "github.com/openHPI/poseidon/pkg/dto"
"io" "io"
@ -81,17 +82,22 @@ func (cw *codeOceanOutputWriter) StdErr() io.Writer {
// Close forwards the kind of exit (timeout, error, normal) to CodeOcean. // Close forwards the kind of exit (timeout, error, normal) to CodeOcean.
// This results in the closing of the WebSocket connection. // This results in the closing of the WebSocket connection.
func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) { func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) {
// Mask the internal stop reason before disclosing/forwarding it externally/to CodeOcean.
switch { switch {
case info.Err == nil:
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
case errors.Is(info.Err, context.DeadlineExceeded) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout): case errors.Is(info.Err, context.DeadlineExceeded) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout):
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout}) cw.send(&dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout})
case errors.Is(info.Err, runner.ErrOOMKilled): case errors.Is(info.Err, runner.ErrOOMKilled):
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: runner.ErrOOMKilled.Error()}) cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: runner.ErrOOMKilled.Error()})
case info.Err != nil: case errors.Is(info.Err, nomad.ErrorAllocationCompleted), errors.Is(info.Err, runner.ErrDestroyedByAPIRequest):
message := "the allocation stopped as expected"
log.WithContext(cw.ctx).WithError(info.Err).Debug(message)
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: message})
default:
errorMessage := "Error executing the request" errorMessage := "Error executing the request"
log.WithContext(cw.ctx).WithError(info.Err).Warn(errorMessage) log.WithContext(cw.ctx).WithError(info.Err).Warn(errorMessage)
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: errorMessage}) cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: errorMessage})
default:
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
} }
} }

View File

@ -189,15 +189,6 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error)
r, stillActive := m.usedRunners.Get(runnerID) r, stillActive := m.usedRunners.Get(runnerID)
if stillActive { if stillActive {
// Mask the internal stop reason because the runner might disclose/forward it to CodeOcean/externally.
switch {
case errors.Is(reason, nomad.ErrorOOMKilled):
reason = ErrOOMKilled
default:
log.WithField(dto.KeyRunnerID, runnerID).WithField("reason", reason).Debug("Internal reason for allocation stop")
reason = ErrAllocationStopped
}
m.usedRunners.Delete(runnerID) m.usedRunners.Delete(runnerID)
if err := r.Destroy(reason); err != nil { if err := r.Destroy(reason); err != nil {
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed") log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")

View File

@ -41,7 +41,6 @@ var (
ErrorUnknownExecution = errors.New("unknown execution") ErrorUnknownExecution = errors.New("unknown execution")
ErrorFileCopyFailed = errors.New("file copy failed") ErrorFileCopyFailed = errors.New("file copy failed")
ErrFileNotFound = errors.New("file not found or insufficient permissions") ErrFileNotFound = errors.New("file not found or insufficient permissions")
ErrAllocationStopped DestroyReason = errors.New("the allocation stopped")
ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner") ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT") ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
@ -244,7 +243,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
err = r.onDestroy(r) err = r.onDestroy(r)
} }
if err == nil && (!errors.Is(reason, ErrAllocationStopped) || !errors.Is(reason, ErrOOMKilled)) { if err == nil && !errors.Is(reason, ErrOOMKilled) {
err = util.RetryExponential(time.Second, func() (err error) { err = util.RetryExponential(time.Second, func() (err error) {
if err = r.api.DeleteJob(r.ID()); err != nil { if err = r.api.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err) err = fmt.Errorf("error deleting runner in Nomad: %w", err)
@ -331,7 +330,7 @@ func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- E
exit <- ExitInfo{255, err} exit <- ExitInfo{255, err}
// This condition prevents further interaction with a stopped / dead allocation. // This condition prevents further interaction with a stopped / dead allocation.
if errors.Is(err, ErrAllocationStopped) || errors.Is(err, ErrOOMKilled) { if errors.Is(err, nomad.ErrorOOMKilled) {
return return
} }