Introduce reason for destroying runner
in order to return a specific error for OOM Killed Executions.
This commit is contained in:

committed by
Sebastian Serth

parent
b3fedf274c
commit
6a1677dea0
@@ -61,7 +61,7 @@ func NewAWSFunctionWorkload(
|
||||
workload.executions = storage.NewMonitoredLocalStorage[*dto.ExecutionRequest](
|
||||
monitoring.MeasurementExecutionsAWS, monitorExecutionsRunnerID(environment.ID(), workload.id), time.Minute, ctx)
|
||||
workload.InactivityTimer = NewInactivityTimer(workload, func(_ Runner) error {
|
||||
return workload.Destroy(false)
|
||||
return workload.Destroy(nil)
|
||||
})
|
||||
return workload, nil
|
||||
}
|
||||
@@ -136,7 +136,7 @@ func (w *AWSFunctionWorkload) GetFileContent(_ string, _ http.ResponseWriter, _
|
||||
return dto.ErrNotSupported
|
||||
}
|
||||
|
||||
func (w *AWSFunctionWorkload) Destroy(_ bool) error {
|
||||
func (w *AWSFunctionWorkload) Destroy(_ DestroyReason) error {
|
||||
w.cancel()
|
||||
if err := w.onDestroy(w); err != nil {
|
||||
return fmt.Errorf("error while destroying aws runner: %w", err)
|
||||
|
@@ -147,7 +147,8 @@ func TestAWSFunctionWorkload_Destroy(t *testing.T) {
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
err = r.Destroy(false)
|
||||
var reason error
|
||||
err = r.Destroy(reason)
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, hasDestroyBeenCalled)
|
||||
}
|
||||
|
@@ -31,7 +31,7 @@ const (
|
||||
TimerExpired TimerState = 2
|
||||
)
|
||||
|
||||
var ErrorRunnerInactivityTimeout = errors.New("runner inactivity timeout exceeded")
|
||||
var ErrorRunnerInactivityTimeout DestroyReason = errors.New("runner inactivity timeout exceeded")
|
||||
|
||||
type InactivityTimerImplementation struct {
|
||||
timer *time.Timer
|
||||
|
@@ -71,7 +71,7 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int
|
||||
|
||||
func (m *NomadRunnerManager) Return(r Runner) error {
|
||||
m.usedRunners.Delete(r.ID())
|
||||
err := r.Destroy(false)
|
||||
err := r.Destroy(ErrDestroyedByAPIRequest)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("cannot return runner: %w", err)
|
||||
}
|
||||
@@ -174,7 +174,7 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
|
||||
}
|
||||
|
||||
// onAllocationStopped is the callback for when Nomad stopped an allocation.
|
||||
func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
|
||||
func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error) (alreadyRemoved bool) {
|
||||
log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped")
|
||||
|
||||
if nomad.IsEnvironmentTemplateID(runnerID) {
|
||||
@@ -189,8 +189,17 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemove
|
||||
|
||||
r, stillActive := m.usedRunners.Get(runnerID)
|
||||
if stillActive {
|
||||
// Mask the internal stop reason because the runner might disclose/forward it to CodeOcean/externally.
|
||||
switch {
|
||||
case errors.Is(reason, nomad.ErrorOOMKilled):
|
||||
reason = ErrOOMKilled
|
||||
default:
|
||||
log.WithField(dto.KeyRunnerID, runnerID).WithField("reason", reason).Debug("Internal reason for allocation stop")
|
||||
reason = ErrAllocationStopped
|
||||
}
|
||||
|
||||
m.usedRunners.Delete(runnerID)
|
||||
if err := r.Destroy(true); err != nil {
|
||||
if err := r.Destroy(reason); err != nil {
|
||||
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")
|
||||
}
|
||||
}
|
||||
|
@@ -259,7 +259,8 @@ func (s *ManagerTestSuite) TestUpdateRunnersRemovesIdleAndUsedRunner() {
|
||||
s.Require().True(ok)
|
||||
mockIdleRunners(environment.(*ExecutionEnvironmentMock))
|
||||
|
||||
testRunner := NewNomadJob(allocation.JobID, nil, nil, s.nomadRunnerManager.onRunnerDestroyed)
|
||||
testRunner := NewNomadJob(allocation.JobID, nil, s.apiMock, s.nomadRunnerManager.onRunnerDestroyed)
|
||||
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
|
||||
environment.AddRunner(testRunner)
|
||||
s.nomadRunnerManager.usedRunners.Add(testRunner.ID(), testRunner)
|
||||
|
||||
@@ -267,7 +268,7 @@ func (s *ManagerTestSuite) TestUpdateRunnersRemovesIdleAndUsedRunner() {
|
||||
call.Run(func(args mock.Arguments) {
|
||||
callbacks, ok := args.Get(1).(*nomad.AllocationProcessing)
|
||||
s.Require().True(ok)
|
||||
callbacks.OnDeleted(allocation.JobID)
|
||||
callbacks.OnDeleted(allocation.JobID, nil)
|
||||
call.ReturnArguments = mock.Arguments{nil}
|
||||
})
|
||||
})
|
||||
@@ -400,7 +401,7 @@ func testStoppedInactivityTimer(s *ManagerTestSuite, stopAllocation bool) {
|
||||
s.Require().False(runnerDestroyed)
|
||||
|
||||
if stopAllocation {
|
||||
alreadyRemoved := s.nomadRunnerManager.onAllocationStopped(runner.ID())
|
||||
alreadyRemoved := s.nomadRunnerManager.onAllocationStopped(runner.ID(), nil)
|
||||
s.False(alreadyRemoved)
|
||||
}
|
||||
|
||||
|
@@ -25,6 +25,8 @@ import (
|
||||
const (
|
||||
// runnerContextKey is the key used to store runners in context.Context.
|
||||
runnerContextKey dto.ContextKey = "runner"
|
||||
// destroyReasonContextKey is the key used to store the reason of the destruction in the context.Context.
|
||||
destroyReasonContextKey dto.ContextKey = "destroyReason"
|
||||
// SIGQUIT is the character that causes a tty to send the SIGQUIT signal to the controlled process.
|
||||
SIGQUIT = 0x1c
|
||||
// executionTimeoutGracePeriod is the time to wait after sending a SIGQUIT signal to a timed out execution.
|
||||
@@ -36,9 +38,13 @@ const (
|
||||
)
|
||||
|
||||
var (
|
||||
ErrorUnknownExecution = errors.New("unknown execution")
|
||||
ErrorFileCopyFailed = errors.New("file copy failed")
|
||||
ErrFileNotFound = errors.New("file not found or insufficient permissions")
|
||||
ErrorUnknownExecution = errors.New("unknown execution")
|
||||
ErrorFileCopyFailed = errors.New("file copy failed")
|
||||
ErrFileNotFound = errors.New("file not found or insufficient permissions")
|
||||
ErrAllocationStopped DestroyReason = errors.New("the allocation stopped")
|
||||
ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
|
||||
ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
|
||||
ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
|
||||
)
|
||||
|
||||
// NomadJob is an abstraction to communicate with Nomad environments.
|
||||
@@ -71,7 +77,7 @@ func NewNomadJob(id string, portMappings []nomadApi.PortMapping,
|
||||
job.executions = storage.NewMonitoredLocalStorage[*dto.ExecutionRequest](
|
||||
monitoring.MeasurementExecutionsNomad, monitorExecutionsRunnerID(job.Environment(), id), time.Minute, ctx)
|
||||
job.InactivityTimer = NewInactivityTimer(job, func(r Runner) error {
|
||||
err := r.Destroy(false)
|
||||
err := r.Destroy(ErrorRunnerInactivityTimeout)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("NomadJob: %w", err)
|
||||
}
|
||||
@@ -230,14 +236,15 @@ func (r *NomadJob) GetFileContent(
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *NomadJob) Destroy(local bool) (err error) {
|
||||
func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
|
||||
r.ctx = context.WithValue(r.ctx, destroyReasonContextKey, reason)
|
||||
r.cancel()
|
||||
r.StopTimeout()
|
||||
if r.onDestroy != nil {
|
||||
err = r.onDestroy(r)
|
||||
}
|
||||
|
||||
if !local && err == nil {
|
||||
if err == nil && (!errors.Is(reason, ErrAllocationStopped) || !errors.Is(reason, ErrOOMKilled)) {
|
||||
err = util.RetryExponential(time.Second, func() (err error) {
|
||||
if err = r.api.DeleteJob(r.ID()); err != nil {
|
||||
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
|
||||
@@ -290,14 +297,22 @@ func (r *NomadJob) handleExitOrContextDone(ctx context.Context, cancelExecute co
|
||||
}
|
||||
|
||||
err := ctx.Err()
|
||||
if r.TimeoutPassed() {
|
||||
err = ErrorRunnerInactivityTimeout
|
||||
if reason, ok := r.ctx.Value(destroyReasonContextKey).(error); ok {
|
||||
err = reason
|
||||
if r.TimeoutPassed() && !errors.Is(err, ErrorRunnerInactivityTimeout) {
|
||||
log.WithError(err).Warn("Wrong destroy reason for expired runner")
|
||||
}
|
||||
}
|
||||
|
||||
// From this time on the WebSocket connection to the client is closed in /internal/api/websocket.go
|
||||
// waitForExit. Input can still be sent to the executor.
|
||||
exit <- ExitInfo{255, err}
|
||||
|
||||
// This condition prevents further interaction with a stopped / dead allocation.
|
||||
if errors.Is(err, ErrAllocationStopped) || errors.Is(err, ErrOOMKilled) {
|
||||
return
|
||||
}
|
||||
|
||||
// This injects the SIGQUIT character into the stdin. This character is parsed by the tty line discipline
|
||||
// (tty has to be true) and converted to a SIGQUIT signal sent to the foreground process attached to the tty.
|
||||
// By default, SIGQUIT causes the process to terminate and produces a core dump. Processes can catch this signal
|
||||
@@ -318,8 +333,8 @@ func (r *NomadJob) handleExitOrContextDone(ctx context.Context, cancelExecute co
|
||||
case <-exitInternal:
|
||||
log.WithContext(ctx).Debug("Execution terminated after SIGQUIT")
|
||||
case <-time.After(executionTimeoutGracePeriod):
|
||||
log.WithContext(ctx).Info("Execution did not quit after SIGQUIT")
|
||||
if err := r.Destroy(false); err != nil {
|
||||
log.WithContext(ctx).Info(ErrCannotStopExecution.Error())
|
||||
if err := r.Destroy(ErrCannotStopExecution); err != nil {
|
||||
log.WithContext(ctx).Error("Error when destroying runner")
|
||||
}
|
||||
}
|
||||
|
@@ -114,12 +114,14 @@ func (s *ExecuteInteractivelyTestSuite) SetupTest() {
|
||||
s.manager = &ManagerMock{}
|
||||
s.manager.On("Return", mock.Anything).Return(nil)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
s.runner = &NomadJob{
|
||||
executions: storage.NewLocalStorage[*dto.ExecutionRequest](),
|
||||
InactivityTimer: s.timer,
|
||||
id: tests.DefaultRunnerID,
|
||||
api: s.apiMock,
|
||||
ctx: context.Background(),
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -235,12 +237,30 @@ func (s *ExecuteInteractivelyTestSuite) TestExitHasTimeoutErrorIfRunnerTimesOut(
|
||||
executionRequest := &dto.ExecutionRequest{}
|
||||
s.runner.StoreExecution(defaultExecutionID, executionRequest)
|
||||
|
||||
exitChannel, cancel, err := s.runner.ExecuteInteractively(
|
||||
exitChannel, _, err := s.runner.ExecuteInteractively(
|
||||
defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background())
|
||||
s.Require().NoError(err)
|
||||
cancel()
|
||||
err = s.runner.Destroy(ErrorRunnerInactivityTimeout)
|
||||
s.Require().NoError(err)
|
||||
exit := <-exitChannel
|
||||
s.Equal(ErrorRunnerInactivityTimeout, exit.Err)
|
||||
s.ErrorIs(exit.Err, ErrorRunnerInactivityTimeout)
|
||||
}
|
||||
|
||||
func (s *ExecuteInteractivelyTestSuite) TestDestroyReasonIsPassedToExecution() {
|
||||
s.mockedExecuteCommandCall.Run(func(args mock.Arguments) {
|
||||
select {}
|
||||
}).Return(0, nil)
|
||||
s.mockedTimeoutPassedCall.Return(true)
|
||||
executionRequest := &dto.ExecutionRequest{}
|
||||
s.runner.StoreExecution(defaultExecutionID, executionRequest)
|
||||
|
||||
exitChannel, _, err := s.runner.ExecuteInteractively(
|
||||
defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background())
|
||||
s.Require().NoError(err)
|
||||
err = s.runner.Destroy(ErrOOMKilled)
|
||||
s.Require().NoError(err)
|
||||
exit := <-exitChannel
|
||||
s.ErrorIs(exit.Err, ErrOOMKilled)
|
||||
}
|
||||
|
||||
func TestUpdateFileSystemTestSuite(t *testing.T) {
|
||||
|
@@ -17,6 +17,9 @@ type ExitInfo struct {
|
||||
|
||||
type DestroyRunnerHandler = func(r Runner) error
|
||||
|
||||
// DestroyReason specifies errors that are expected as reason for destroying a runner.
|
||||
type DestroyReason error
|
||||
|
||||
type Runner interface {
|
||||
InactivityTimer
|
||||
|
||||
@@ -59,8 +62,8 @@ type Runner interface {
|
||||
GetFileContent(path string, content http.ResponseWriter, privilegedExecution bool, ctx context.Context) error
|
||||
|
||||
// Destroy destroys the Runner in Nomad.
|
||||
// Iff local is true, the destruction will not be propagated to external systems.
|
||||
Destroy(local bool) error
|
||||
// Depending on the reason special cases of the Destruction will be handled.
|
||||
Destroy(reason DestroyReason) error
|
||||
}
|
||||
|
||||
// NewContext creates a context containing a runner.
|
||||
|
@@ -1,4 +1,4 @@
|
||||
// Code generated by mockery v2.30.1. DO NOT EDIT.
|
||||
// Code generated by mockery v2.30.16. DO NOT EDIT.
|
||||
|
||||
package runner
|
||||
|
||||
@@ -20,13 +20,13 @@ type RunnerMock struct {
|
||||
mock.Mock
|
||||
}
|
||||
|
||||
// Destroy provides a mock function with given fields: local
|
||||
func (_m *RunnerMock) Destroy(local bool) error {
|
||||
ret := _m.Called(local)
|
||||
// Destroy provides a mock function with given fields: reason
|
||||
func (_m *RunnerMock) Destroy(reason DestroyReason) error {
|
||||
ret := _m.Called(reason)
|
||||
|
||||
var r0 error
|
||||
if rf, ok := ret.Get(0).(func(bool) error); ok {
|
||||
r0 = rf(local)
|
||||
if rf, ok := ret.Get(0).(func(DestroyReason) error); ok {
|
||||
r0 = rf(reason)
|
||||
} else {
|
||||
r0 = ret.Error(0)
|
||||
}
|
||||
|
Reference in New Issue
Block a user