Fix OOM Kill race condition
due to the Nomad request exiting before the allocation is stopped. We catch this behavior by introducing a time period for the allocation being stopped iff the exit code is 128.
This commit is contained in:

committed by
Sebastian Serth

parent
6a1677dea0
commit
8ef5f4e7c5
@ -288,14 +288,36 @@ func (r *NomadJob) handleExitOrContextDone(ctx context.Context, cancelExecute co
|
|||||||
case exitInfo := <-exitInternal:
|
case exitInfo := <-exitInternal:
|
||||||
// - The execution ended in time or
|
// - The execution ended in time or
|
||||||
// - the HTTP request of the client/CodeOcean got canceled.
|
// - the HTTP request of the client/CodeOcean got canceled.
|
||||||
exit <- exitInfo
|
r.handleExit(exitInfo, exitInternal, exit, stdin, ctx)
|
||||||
return
|
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
// - The execution timeout was exceeded,
|
// - The execution timeout was exceeded,
|
||||||
// - the runner was destroyed (runner timeout, or API delete request), or
|
// - the runner was destroyed (runner timeout, or API delete request), or
|
||||||
// - the WebSocket connection to the client/CodeOcean closed.
|
// - the WebSocket connection to the client/CodeOcean closed.
|
||||||
|
r.handleContextDone(exitInternal, exit, stdin, ctx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *NomadJob) handleExit(exitInfo ExitInfo, exitInternal <-chan ExitInfo, exit chan<- ExitInfo,
|
||||||
|
stdin io.ReadWriter, ctx context.Context) {
|
||||||
|
// Special Handling of OOM Killed allocations. See #401.
|
||||||
|
const exitCodeOfLikelyOOMKilledAllocation = 128
|
||||||
|
const gracePeriodForConfirmingOOMKillReason = time.Second
|
||||||
|
if exitInfo.Code == exitCodeOfLikelyOOMKilledAllocation {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
// We consider this allocation to be OOM Killed.
|
||||||
|
r.handleContextDone(exitInternal, exit, stdin, ctx)
|
||||||
|
return
|
||||||
|
case <-time.After(gracePeriodForConfirmingOOMKillReason):
|
||||||
|
// We consider that the payload code returned the exit code.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
exit <- exitInfo
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *NomadJob) handleContextDone(exitInternal <-chan ExitInfo, exit chan<- ExitInfo,
|
||||||
|
stdin io.ReadWriter, ctx context.Context) {
|
||||||
err := ctx.Err()
|
err := ctx.Err()
|
||||||
if reason, ok := r.ctx.Value(destroyReasonContextKey).(error); ok {
|
if reason, ok := r.ctx.Value(destroyReasonContextKey).(error); ok {
|
||||||
err = reason
|
err = reason
|
||||||
|
@ -263,6 +263,47 @@ func (s *ExecuteInteractivelyTestSuite) TestDestroyReasonIsPassedToExecution() {
|
|||||||
s.ErrorIs(exit.Err, ErrOOMKilled)
|
s.ErrorIs(exit.Err, ErrOOMKilled)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *ExecuteInteractivelyTestSuite) TestSuspectedOOMKilledExecutionWaitsForVerification() {
|
||||||
|
s.mockedExecuteCommandCall.Return(128, nil)
|
||||||
|
executionRequest := &dto.ExecutionRequest{}
|
||||||
|
s.Run("Actually OOM Killed", func() {
|
||||||
|
s.runner.StoreExecution(defaultExecutionID, executionRequest)
|
||||||
|
exitChannel, _, err := s.runner.ExecuteInteractively(
|
||||||
|
defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background())
|
||||||
|
s.Require().NoError(err)
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-exitChannel:
|
||||||
|
s.FailNow("For exit code 128 Poseidon should wait a while to verify the OOM Kill assumption.")
|
||||||
|
case <-time.After(tests.ShortTimeout):
|
||||||
|
// All good. Poseidon waited.
|
||||||
|
}
|
||||||
|
|
||||||
|
err = s.runner.Destroy(ErrOOMKilled)
|
||||||
|
s.Require().NoError(err)
|
||||||
|
exit := <-exitChannel
|
||||||
|
s.ErrorIs(exit.Err, ErrOOMKilled)
|
||||||
|
})
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
s.runner.ctx = ctx
|
||||||
|
s.runner.cancel = cancel
|
||||||
|
s.Run("Not OOM Killed", func() {
|
||||||
|
s.runner.StoreExecution(defaultExecutionID, executionRequest)
|
||||||
|
exitChannel, _, err := s.runner.ExecuteInteractively(
|
||||||
|
defaultExecutionID, &nullio.ReadWriter{}, nil, nil, context.Background())
|
||||||
|
s.Require().NoError(err)
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-time.After(tests.ShortTimeout + time.Second):
|
||||||
|
s.FailNow("Poseidon should not wait too long for verifying the OOM Kill assumption.")
|
||||||
|
case exit := <-exitChannel:
|
||||||
|
s.Equal(uint8(128), exit.Code)
|
||||||
|
s.Nil(exit.Err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func TestUpdateFileSystemTestSuite(t *testing.T) {
|
func TestUpdateFileSystemTestSuite(t *testing.T) {
|
||||||
suite.Run(t, new(UpdateFileSystemTestSuite))
|
suite.Run(t, new(UpdateFileSystemTestSuite))
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user