Refactor Runner Destroy Reason Masking
and ignore expected reasons, such as when the runner got destroyed by an API request.
This commit is contained in:
@ -5,6 +5,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"github.com/gorilla/websocket"
|
"github.com/gorilla/websocket"
|
||||||
|
"github.com/openHPI/poseidon/internal/nomad"
|
||||||
"github.com/openHPI/poseidon/internal/runner"
|
"github.com/openHPI/poseidon/internal/runner"
|
||||||
"github.com/openHPI/poseidon/pkg/dto"
|
"github.com/openHPI/poseidon/pkg/dto"
|
||||||
"io"
|
"io"
|
||||||
@ -81,17 +82,22 @@ func (cw *codeOceanOutputWriter) StdErr() io.Writer {
|
|||||||
// Close forwards the kind of exit (timeout, error, normal) to CodeOcean.
|
// Close forwards the kind of exit (timeout, error, normal) to CodeOcean.
|
||||||
// This results in the closing of the WebSocket connection.
|
// This results in the closing of the WebSocket connection.
|
||||||
func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) {
|
func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) {
|
||||||
|
// Mask the internal stop reason before disclosing/forwarding it externally/to CodeOcean.
|
||||||
switch {
|
switch {
|
||||||
|
case info.Err == nil:
|
||||||
|
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
|
||||||
case errors.Is(info.Err, context.DeadlineExceeded) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout):
|
case errors.Is(info.Err, context.DeadlineExceeded) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout):
|
||||||
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout})
|
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout})
|
||||||
case errors.Is(info.Err, runner.ErrOOMKilled):
|
case errors.Is(info.Err, runner.ErrOOMKilled):
|
||||||
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: runner.ErrOOMKilled.Error()})
|
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: runner.ErrOOMKilled.Error()})
|
||||||
case info.Err != nil:
|
case errors.Is(info.Err, nomad.ErrorAllocationCompleted), errors.Is(info.Err, runner.ErrDestroyedByAPIRequest):
|
||||||
|
message := "the allocation stopped as expected"
|
||||||
|
log.WithContext(cw.ctx).WithError(info.Err).Debug(message)
|
||||||
|
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: message})
|
||||||
|
default:
|
||||||
errorMessage := "Error executing the request"
|
errorMessage := "Error executing the request"
|
||||||
log.WithContext(cw.ctx).WithError(info.Err).Warn(errorMessage)
|
log.WithContext(cw.ctx).WithError(info.Err).Warn(errorMessage)
|
||||||
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: errorMessage})
|
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: errorMessage})
|
||||||
default:
|
|
||||||
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -189,15 +189,6 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error)
|
|||||||
|
|
||||||
r, stillActive := m.usedRunners.Get(runnerID)
|
r, stillActive := m.usedRunners.Get(runnerID)
|
||||||
if stillActive {
|
if stillActive {
|
||||||
// Mask the internal stop reason because the runner might disclose/forward it to CodeOcean/externally.
|
|
||||||
switch {
|
|
||||||
case errors.Is(reason, nomad.ErrorOOMKilled):
|
|
||||||
reason = ErrOOMKilled
|
|
||||||
default:
|
|
||||||
log.WithField(dto.KeyRunnerID, runnerID).WithField("reason", reason).Debug("Internal reason for allocation stop")
|
|
||||||
reason = ErrAllocationStopped
|
|
||||||
}
|
|
||||||
|
|
||||||
m.usedRunners.Delete(runnerID)
|
m.usedRunners.Delete(runnerID)
|
||||||
if err := r.Destroy(reason); err != nil {
|
if err := r.Destroy(reason); err != nil {
|
||||||
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")
|
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")
|
||||||
|
@ -41,7 +41,6 @@ var (
|
|||||||
ErrorUnknownExecution = errors.New("unknown execution")
|
ErrorUnknownExecution = errors.New("unknown execution")
|
||||||
ErrorFileCopyFailed = errors.New("file copy failed")
|
ErrorFileCopyFailed = errors.New("file copy failed")
|
||||||
ErrFileNotFound = errors.New("file not found or insufficient permissions")
|
ErrFileNotFound = errors.New("file not found or insufficient permissions")
|
||||||
ErrAllocationStopped DestroyReason = errors.New("the allocation stopped")
|
|
||||||
ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
|
ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
|
||||||
ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
|
ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
|
||||||
ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
|
ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
|
||||||
@ -244,7 +243,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
|
|||||||
err = r.onDestroy(r)
|
err = r.onDestroy(r)
|
||||||
}
|
}
|
||||||
|
|
||||||
if err == nil && (!errors.Is(reason, ErrAllocationStopped) || !errors.Is(reason, ErrOOMKilled)) {
|
if err == nil && !errors.Is(reason, ErrOOMKilled) {
|
||||||
err = util.RetryExponential(time.Second, func() (err error) {
|
err = util.RetryExponential(time.Second, func() (err error) {
|
||||||
if err = r.api.DeleteJob(r.ID()); err != nil {
|
if err = r.api.DeleteJob(r.ID()); err != nil {
|
||||||
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
|
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
|
||||||
@ -331,7 +330,7 @@ func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- E
|
|||||||
exit <- ExitInfo{255, err}
|
exit <- ExitInfo{255, err}
|
||||||
|
|
||||||
// This condition prevents further interaction with a stopped / dead allocation.
|
// This condition prevents further interaction with a stopped / dead allocation.
|
||||||
if errors.Is(err, ErrAllocationStopped) || errors.Is(err, ErrOOMKilled) {
|
if errors.Is(err, nomad.ErrorOOMKilled) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user