Refactor Runner Destroy Reason Masking

and ignore expected reasons such when the runner got destroyed by an API request.
This commit is contained in:
Maximilian Paß
2023-07-21 17:42:49 +01:00
parent 102b3f0701
commit eb818f92f7
3 changed files with 11 additions and 15 deletions

View File

@ -5,6 +5,7 @@ import (
"encoding/json"
"errors"
"github.com/gorilla/websocket"
"github.com/openHPI/poseidon/internal/nomad"
"github.com/openHPI/poseidon/internal/runner"
"github.com/openHPI/poseidon/pkg/dto"
"io"
@ -81,17 +82,22 @@ func (cw *codeOceanOutputWriter) StdErr() io.Writer {
// Close forwards the kind of exit (timeout, error, normal) to CodeOcean.
// This results in the closing of the WebSocket connection.
func (cw *codeOceanOutputWriter) Close(info *runner.ExitInfo) {
// Mask the internal stop reason before disclosing/forwarding it externally/to CodeOcean.
switch {
case info.Err == nil:
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
case errors.Is(info.Err, context.DeadlineExceeded) || errors.Is(info.Err, runner.ErrorRunnerInactivityTimeout):
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout})
case errors.Is(info.Err, runner.ErrOOMKilled):
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: runner.ErrOOMKilled.Error()})
case info.Err != nil:
case errors.Is(info.Err, nomad.ErrorAllocationCompleted), errors.Is(info.Err, runner.ErrDestroyedByAPIRequest):
message := "the allocation stopped as expected"
log.WithContext(cw.ctx).WithError(info.Err).Debug(message)
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: message})
default:
errorMessage := "Error executing the request"
log.WithContext(cw.ctx).WithError(info.Err).Warn(errorMessage)
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketOutputError, Data: errorMessage})
default:
cw.send(&dto.WebSocketMessage{Type: dto.WebSocketExit, ExitCode: info.Code})
}
}

View File

@ -189,15 +189,6 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error)
r, stillActive := m.usedRunners.Get(runnerID)
if stillActive {
// Mask the internal stop reason because the runner might disclose/forward it to CodeOcean/externally.
switch {
case errors.Is(reason, nomad.ErrorOOMKilled):
reason = ErrOOMKilled
default:
log.WithField(dto.KeyRunnerID, runnerID).WithField("reason", reason).Debug("Internal reason for allocation stop")
reason = ErrAllocationStopped
}
m.usedRunners.Delete(runnerID)
if err := r.Destroy(reason); err != nil {
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")

View File

@ -41,7 +41,6 @@ var (
ErrorUnknownExecution = errors.New("unknown execution")
ErrorFileCopyFailed = errors.New("file copy failed")
ErrFileNotFound = errors.New("file not found or insufficient permissions")
ErrAllocationStopped DestroyReason = errors.New("the allocation stopped")
ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
@ -244,7 +243,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
err = r.onDestroy(r)
}
if err == nil && (!errors.Is(reason, ErrAllocationStopped) || !errors.Is(reason, ErrOOMKilled)) {
if err == nil && !errors.Is(reason, ErrOOMKilled) {
err = util.RetryExponential(time.Second, func() (err error) {
if err = r.api.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
@ -331,7 +330,7 @@ func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- E
exit <- ExitInfo{255, err}
// This condition prevents further interaction with a stopped / dead allocation.
if errors.Is(err, ErrAllocationStopped) || errors.Is(err, ErrOOMKilled) {
if errors.Is(err, nomad.ErrorOOMKilled) {
return
}