Destroy runner on allocation stopped

Destroying the runner as soon as Nomad informs us that its allocation has stopped fixes the bug where executions kept running into their timeout even though the allocation had been stopped long before.
Maximilian Paß
2023-06-26 17:03:52 +01:00
committed by Sebastian Serth
parent 40a5f2eca6
commit bfb5977d24
10 changed files with 104 additions and 74 deletions
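
The core of the change is the new local flag on Destroy: when Nomad itself reports the allocation as stopped, the runner only cleans up locally and skips the job deletion. A minimal sketch of that contract, with deleteJob as a hypothetical stand-in for the Nomad API client:

package main

import "fmt"

// deleteJob is a hypothetical stand-in for the Nomad API client's DeleteJob call.
func deleteJob(runnerID string) error {
	fmt.Println("DeleteJob:", runnerID)
	return nil
}

// destroy sketches the Destroy(local bool) semantics: local cleanup always
// happens; the Nomad job is only deleted when the destruction did not
// originate from Nomad itself (local == false).
func destroy(runnerID string, local bool) error {
	// ... cancel the runner context, stop the inactivity timer, notify the manager ...
	if local {
		return nil
	}
	if err := deleteJob(runnerID); err != nil {
		return fmt.Errorf("cannot destroy runner: %w", err)
	}
	return nil
}

func main() {
	_ = destroy("runner-1", false) // returned by a user: propagate the deletion to Nomad
	_ = destroy("runner-2", true)  // allocation already stopped by Nomad: local cleanup only
}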

View File

@ -61,7 +61,7 @@ func NewAWSFunctionWorkload(
workload.executions = storage.NewMonitoredLocalStorage[*dto.ExecutionRequest](
monitoring.MeasurementExecutionsAWS, monitorExecutionsRunnerID(environment.ID(), workload.id), time.Minute, ctx)
workload.InactivityTimer = NewInactivityTimer(workload, func(_ Runner) error {
return workload.Destroy()
return workload.Destroy(false)
})
return workload, nil
}
@ -136,7 +136,7 @@ func (w *AWSFunctionWorkload) GetFileContent(_ string, _ http.ResponseWriter, _
return dto.ErrNotSupported
}
func (w *AWSFunctionWorkload) Destroy() error {
func (w *AWSFunctionWorkload) Destroy(_ bool) error {
w.cancel()
if err := w.onDestroy(w); err != nil {
return fmt.Errorf("error while destroying aws runner: %w", err)

View File

@ -147,7 +147,7 @@ func TestAWSFunctionWorkload_Destroy(t *testing.T) {
})
require.NoError(t, err)
err = r.Destroy()
err = r.Destroy(false)
assert.NoError(t, err)
assert.True(t, hasDestroyBeenCalled)
}

View File

@ -1,8 +1,8 @@
package runner
import (
"github.com/openHPI/poseidon/internal/nomad"
"github.com/openHPI/poseidon/tests"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite"
"testing"
"time"
@ -15,18 +15,17 @@ func TestInactivityTimerTestSuite(t *testing.T) {
type InactivityTimerTestSuite struct {
suite.Suite
runner Runner
manager *ManagerMock
returned chan bool
}
func (s *InactivityTimerTestSuite) SetupTest() {
s.returned = make(chan bool, 1)
s.manager = &ManagerMock{}
s.manager.On("Return", mock.Anything).Run(func(_ mock.Arguments) {
apiMock := &nomad.ExecutorAPIMock{}
apiMock.On("DeleteJob", tests.DefaultRunnerID).Return(nil)
s.runner = NewNomadJob(tests.DefaultRunnerID, nil, apiMock, func(_ Runner) error {
s.returned <- true
}).Return(nil)
s.runner = NewRunner(tests.DefaultRunnerID, s.manager)
return nil
})
s.runner.SetupTimeout(tests.ShortTimeout)
}

View File

@ -71,17 +71,11 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int
func (m *NomadRunnerManager) Return(r Runner) error {
m.usedRunners.Delete(r.ID())
r.StopTimeout()
err := util.RetryExponential(time.Second, func() (err error) {
if err = m.apiClient.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
}
return
})
err := r.Destroy(false)
if err != nil {
return fmt.Errorf("%w", err)
err = fmt.Errorf("cannot return runner: %w", err)
}
return nil
return err
}
func (m *NomadRunnerManager) Load() {
@ -114,7 +108,7 @@ func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger
environmentLogger.WithError(err).Warn("Error loading runner portMappings")
return
}
newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.Return)
newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.onRunnerDestroyed)
log.WithField("isUsed", isUsed).WithField(dto.KeyRunnerID, newJob.ID()).Debug("Recovered Runner")
if isUsed {
m.usedRunners.Add(newJob.ID(), newJob)
@ -140,6 +134,7 @@ func (m *NomadRunnerManager) keepRunnersSynced(ctx context.Context) {
}
}
// onAllocationAdded is the callback for when Nomad started an allocation.
func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, startup time.Duration) {
log.WithField(dto.KeyRunnerID, alloc.JobID).WithField("startupDuration", startup).Debug("Runner started")
@ -165,7 +160,7 @@ func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, start
if alloc.AllocatedResources != nil {
mappedPorts = alloc.AllocatedResources.Shared.Ports
}
environment.AddRunner(NewNomadJob(alloc.JobID, mappedPorts, m.apiClient, m.Return))
environment.AddRunner(NewNomadJob(alloc.JobID, mappedPorts, m.apiClient, m.onRunnerDestroyed))
monitorAllocationStartupDuration(startup, alloc.JobID, environmentID)
}
}
@ -178,6 +173,7 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
monitoring.WriteInfluxPoint(p)
}
// onAllocationStopped is the callback for when Nomad stopped an allocation.
func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped")
@ -193,8 +189,10 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemove
r, stillActive := m.usedRunners.Get(runnerID)
if stillActive {
r.StopTimeout()
m.usedRunners.Delete(runnerID)
if err := r.Destroy(true); err != nil {
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")
}
}
environment, ok := m.environments.Get(environmentID.ToString())
@ -204,3 +202,15 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemove
return !stillActive
}
// onRunnerDestroyed is the callback when the runner destroys itself.
// The main use of this callback is to remove the runner from the used runners, when its timeout exceeds.
func (m *NomadRunnerManager) onRunnerDestroyed(r Runner) error {
m.usedRunners.Delete(r.ID())
environment, ok := m.environments.Get(r.Environment().ToString())
if ok {
environment.DeleteRunner(r.ID())
}
return nil
}
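
For illustration, a condensed sketch of the allocation-stopped path above, assuming a simplified runner type and a plain map in place of the monitored storage; the names are illustrative only:

package main

import "fmt"

type runner struct{ id string }

// Destroy with local=true performs only local cleanup, since Nomad has already
// stopped the allocation and no DeleteJob call is needed.
func (r *runner) Destroy(local bool) error {
	fmt.Printf("destroying %s (local=%v)\n", r.id, local)
	return nil
}

type manager struct{ usedRunners map[string]*runner }

// onAllocationStopped mirrors the callback: a runner that is still marked as
// used is removed and destroyed immediately instead of running into its timeout.
func (m *manager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
	r, stillActive := m.usedRunners[runnerID]
	if stillActive {
		delete(m.usedRunners, runnerID)
		if err := r.Destroy(true); err != nil {
			fmt.Println("runner of stopped allocation cannot be destroyed:", err)
		}
	}
	return !stillActive
}

func main() {
	m := &manager{usedRunners: map[string]*runner{"runner-1": {id: "runner-1"}}}
	fmt.Println(m.onAllocationStopped("runner-1")) // false: the runner was still in use
	fmt.Println(m.onAllocationStopped("runner-1")) // true: it had already been removed
}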

View File

@ -40,7 +40,7 @@ func (s *ManagerTestSuite) SetupTest() {
cancel()
s.nomadRunnerManager = NewNomadRunnerManager(s.apiMock, ctx)
s.exerciseRunner = NewRunner(tests.DefaultRunnerID, s.nomadRunnerManager)
s.exerciseRunner = NewNomadJob(tests.DefaultRunnerID, nil, s.apiMock, s.nomadRunnerManager.onRunnerDestroyed)
s.exerciseEnvironment = createBasicEnvironmentMock(defaultEnvironmentID)
s.nomadRunnerManager.StoreEnvironment(s.exerciseEnvironment)
}
@ -130,7 +130,7 @@ func (s *ManagerTestSuite) TestClaimReturnsNoRunnerOfDifferentEnvironment() {
func (s *ManagerTestSuite) TestClaimDoesNotReturnTheSameRunnerTwice() {
s.exerciseEnvironment.On("Sample", mock.Anything).Return(s.exerciseRunner, true).Once()
s.exerciseEnvironment.On("Sample", mock.Anything).
Return(NewRunner(tests.AnotherRunnerID, s.nomadRunnerManager), true).Once()
Return(NewNomadJob(tests.AnotherRunnerID, nil, nil, s.nomadRunnerManager.onRunnerDestroyed), true).Once()
firstReceivedRunner, err := s.nomadRunnerManager.Claim(defaultEnvironmentID, defaultInactivityTimeout)
s.NoError(err)
@ -150,6 +150,7 @@ func (s *ManagerTestSuite) TestClaimAddsRunnerToUsedRunners() {
func (s *ManagerTestSuite) TestClaimRemovesRunnerWhenMarkAsUsedFails() {
s.exerciseEnvironment.On("Sample", mock.Anything).Return(s.exerciseRunner, true)
s.exerciseEnvironment.On("DeleteRunner", mock.AnythingOfType("string")).Return()
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
util.MaxConnectionRetriesExponential = 1
modifyMockedCall(s.apiMock, "MarkRunnerAsUsed", func(call *mock.Call) {
@ -181,6 +182,7 @@ func (s *ManagerTestSuite) TestGetReturnsErrorIfRunnerNotFound() {
func (s *ManagerTestSuite) TestReturnRemovesRunnerFromUsedRunners() {
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
s.exerciseEnvironment.On("DeleteRunner", mock.AnythingOfType("string")).Return()
s.nomadRunnerManager.usedRunners.Add(s.exerciseRunner.ID(), s.exerciseRunner)
err := s.nomadRunnerManager.Return(s.exerciseRunner)
s.Nil(err)
@ -190,6 +192,7 @@ func (s *ManagerTestSuite) TestReturnRemovesRunnerFromUsedRunners() {
func (s *ManagerTestSuite) TestReturnCallsDeleteRunnerApiMethod() {
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
s.exerciseEnvironment.On("DeleteRunner", mock.AnythingOfType("string")).Return()
err := s.nomadRunnerManager.Return(s.exerciseRunner)
s.Nil(err)
s.apiMock.AssertCalled(s.T(), "DeleteJob", s.exerciseRunner.ID())
@ -197,6 +200,7 @@ func (s *ManagerTestSuite) TestReturnCallsDeleteRunnerApiMethod() {
func (s *ManagerTestSuite) TestReturnReturnsErrorWhenApiCallFailed() {
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(tests.ErrDefault)
s.exerciseEnvironment.On("DeleteRunner", mock.AnythingOfType("string")).Return()
err := s.nomadRunnerManager.Return(s.exerciseRunner)
s.Error(err)
}
@ -255,7 +259,7 @@ func (s *ManagerTestSuite) TestUpdateRunnersRemovesIdleAndUsedRunner() {
s.Require().True(ok)
mockIdleRunners(environment.(*ExecutionEnvironmentMock))
testRunner := NewRunner(allocation.JobID, s.nomadRunnerManager)
testRunner := NewNomadJob(allocation.JobID, nil, nil, s.nomadRunnerManager.onRunnerDestroyed)
environment.AddRunner(testRunner)
s.nomadRunnerManager.usedRunners.Add(testRunner.ID(), testRunner)
@ -378,20 +382,22 @@ func (s *ManagerTestSuite) TestOnAllocationStopped() {
func testStoppedInactivityTimer(s *ManagerTestSuite, stopAllocation bool) {
s.T().Helper()
s.apiMock.On("DeleteJob", tests.DefaultRunnerID).Return(nil)
environment, ok := s.nomadRunnerManager.environments.Get(tests.DefaultEnvironmentIDAsString)
s.Require().True(ok)
mockIdleRunners(environment.(*ExecutionEnvironmentMock))
inactivityTimerCalled := false
runnerDestroyed := false
environment.AddRunner(NewNomadJob(tests.DefaultRunnerID, []nomadApi.PortMapping{}, s.apiMock, func(r Runner) error {
inactivityTimerCalled = true
runnerDestroyed = true
return nil
}))
runner, err := s.nomadRunnerManager.Claim(defaultEnvironmentID, 1)
s.Require().NoError(err)
s.Require().False(runner.TimeoutPassed())
s.Require().False(inactivityTimerCalled)
s.Require().False(runnerDestroyed)
if stopAllocation {
alreadyRemoved := s.nomadRunnerManager.onAllocationStopped(runner.ID())
@ -400,7 +406,7 @@ func testStoppedInactivityTimer(s *ManagerTestSuite, stopAllocation bool) {
<-time.After(time.Second + tests.ShortTimeout)
s.NotEqual(stopAllocation, runner.TimeoutPassed())
s.NotEqual(stopAllocation, inactivityTimerCalled)
s.True(runnerDestroyed)
}
func TestNomadRunnerManager_Load(t *testing.T) {
@ -449,6 +455,7 @@ func TestNomadRunnerManager_Load(t *testing.T) {
runnerManager.usedRunners.Purge()
t.Run("Restart timeout of used runner", func(t *testing.T) {
apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
environmentMock.On("DeleteRunner", mock.AnythingOfType("string")).Once().Return()
timeout := 1
_, job := helpers.CreateTemplateJob()

View File

@ -15,6 +15,7 @@ import (
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/nullio"
"github.com/openHPI/poseidon/pkg/storage"
"github.com/openHPI/poseidon/pkg/util"
"io"
"net/http"
"strings"
@ -69,7 +70,13 @@ func NewNomadJob(id string, portMappings []nomadApi.PortMapping,
}
job.executions = storage.NewMonitoredLocalStorage[*dto.ExecutionRequest](
monitoring.MeasurementExecutionsNomad, monitorExecutionsRunnerID(job.Environment(), id), time.Minute, ctx)
job.InactivityTimer = NewInactivityTimer(job, onDestroy)
job.InactivityTimer = NewInactivityTimer(job, func(r Runner) error {
err := r.Destroy(false)
if err != nil {
err = fmt.Errorf("NomadJob: %w", err)
}
return err
})
return job
}
@ -120,7 +127,7 @@ func (r *NomadJob) ExecuteInteractively(
// We have to handle three contexts
// - requestCtx: The context of the http request (including Sentry data)
// - r.ctx: The context of the runner (runner timeout)
// - r.ctx: The context of the runner (runner timeout, or runner destroyed)
// - executionCtx: The context of the execution (execution timeout)
// -> The executionCtx cancel that might be triggered (when the client connection breaks)
@ -223,12 +230,26 @@ func (r *NomadJob) GetFileContent(
return nil
}
func (r *NomadJob) Destroy() error {
func (r *NomadJob) Destroy(local bool) (err error) {
r.cancel()
if err := r.onDestroy(r); err != nil {
return fmt.Errorf("error while destroying runner: %w", err)
r.StopTimeout()
if r.onDestroy != nil {
err = r.onDestroy(r)
}
return nil
if !local && err == nil {
err = util.RetryExponential(time.Second, func() (err error) {
if err = r.api.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
}
return
})
}
if err != nil {
err = fmt.Errorf("cannot destroy runner: %w", err)
}
return err
}
func prepareExecution(request *dto.ExecutionRequest, environmentCtx context.Context) (
@ -291,7 +312,7 @@ func (r *NomadJob) handleExitOrContextDone(ctx context.Context, cancelExecute co
log.WithContext(ctx).Debug("Execution terminated after SIGQUIT")
case <-time.After(executionTimeoutGracePeriod):
log.WithContext(ctx).Info("Execution did not quit after SIGQUIT")
if err := r.Destroy(); err != nil {
if err := r.Destroy(false); err != nil {
log.WithContext(ctx).Error("Error when destroying runner")
}
}

View File

@ -87,15 +87,6 @@ func TestFromContextReturnsIsNotOkWhenContextHasNoRunner(t *testing.T) {
assert.False(t, ok)
}
func TestDestroyReturnsRunner(t *testing.T) {
manager := &ManagerMock{}
manager.On("Return", mock.Anything).Return(nil)
runner := NewRunner(tests.DefaultRunnerID, manager)
err := runner.Destroy()
assert.NoError(t, err)
manager.AssertCalled(t, "Return", runner)
}
func TestExecuteInteractivelyTestSuite(t *testing.T) {
suite.Run(t, new(ExecuteInteractivelyTestSuite))
}
@ -115,7 +106,9 @@ func (s *ExecuteInteractivelyTestSuite) SetupTest() {
s.mockedExecuteCommandCall = s.apiMock.On("ExecuteCommand", mock.Anything, mock.Anything, mock.Anything,
true, false, mock.Anything, mock.Anything, mock.Anything).
Return(0, nil)
s.apiMock.On("DeleteJob", mock.AnythingOfType("string")).Return(nil)
s.timer = &InactivityTimerMock{}
s.timer.On("StopTimeout").Return()
s.timer.On("ResetTimeout").Return()
s.mockedTimeoutPassedCall = s.timer.On("TimeoutPassed").Return(false)
s.manager = &ManagerMock{}
@ -126,7 +119,6 @@ func (s *ExecuteInteractivelyTestSuite) SetupTest() {
InactivityTimer: s.timer,
id: tests.DefaultRunnerID,
api: s.apiMock,
onDestroy: s.manager.Return,
ctx: context.Background(),
}
}
@ -207,15 +199,24 @@ func (s *ExecuteInteractivelyTestSuite) TestDestroysRunnerAfterTimeoutAndSignal(
s.mockedExecuteCommandCall.Run(func(args mock.Arguments) {
select {}
})
runnerDestroyed := false
s.runner.onDestroy = func(_ Runner) error {
runnerDestroyed = true
return nil
}
timeLimit := 1
executionRequest := &dto.ExecutionRequest{TimeLimit: timeLimit}
s.runner.cancel = func() {}
s.runner.StoreExecution(defaultExecutionID, executionRequest)
_, _, err := s.runner.ExecuteInteractively(
defaultExecutionID, bytes.NewBuffer(make([]byte, 1)), nil, nil, context.Background())
s.Require().NoError(err)
<-time.After(executionTimeoutGracePeriod + time.Duration(timeLimit)*time.Second + tests.ShortTimeout)
s.manager.AssertCalled(s.T(), "Return", s.runner)
s.manager.AssertNotCalled(s.T(), "Return", s.runner)
s.apiMock.AssertCalled(s.T(), "DeleteJob", s.runner.ID())
s.True(runnerDestroyed)
}
func (s *ExecuteInteractivelyTestSuite) TestResetTimerGetsCalled() {
@ -395,17 +396,6 @@ func (s *UpdateFileSystemTestSuite) readFilesFromTarArchive(tarArchive io.Reader
return files
}
// NewRunner creates a new runner with the provided id and manager.
func NewRunner(id string, manager Accessor) Runner {
var handler DestroyRunnerHandler
if manager != nil {
handler = manager.Return
} else {
handler = func(_ Runner) error { return nil }
}
return NewNomadJob(id, nil, nil, handler)
}
func (s *UpdateFileSystemTestSuite) TestGetFileContentReturnsErrorIfExitCodeIsNotZero() {
s.mockedExecuteCommandCall.RunFn = nil
s.mockedExecuteCommandCall.Return(1, nil)

View File

@ -59,7 +59,8 @@ type Runner interface {
GetFileContent(path string, content http.ResponseWriter, privilegedExecution bool, ctx context.Context) error
// Destroy destroys the Runner in Nomad.
Destroy() error
// Iff local is true, the destruction will not be propagated to external systems.
Destroy(local bool) error
}
// NewContext creates a context containing a runner.

View File

@ -1,4 +1,4 @@
// Code generated by mockery v2.16.0. DO NOT EDIT.
// Code generated by mockery v2.30.1. DO NOT EDIT.
package runner
@ -20,13 +20,13 @@ type RunnerMock struct {
mock.Mock
}
// Destroy provides a mock function with given fields:
func (_m *RunnerMock) Destroy() error {
ret := _m.Called()
// Destroy provides a mock function with given fields: local
func (_m *RunnerMock) Destroy(local bool) error {
ret := _m.Called(local)
var r0 error
if rf, ok := ret.Get(0).(func() error); ok {
r0 = rf()
if rf, ok := ret.Get(0).(func(bool) error); ok {
r0 = rf(local)
} else {
r0 = ret.Error(0)
}
@ -53,6 +53,11 @@ func (_m *RunnerMock) ExecuteInteractively(id string, stdin io.ReadWriter, stdou
ret := _m.Called(id, stdin, stdout, stderr, ctx)
var r0 <-chan ExitInfo
var r1 context.CancelFunc
var r2 error
if rf, ok := ret.Get(0).(func(string, io.ReadWriter, io.Writer, io.Writer, context.Context) (<-chan ExitInfo, context.CancelFunc, error)); ok {
return rf(id, stdin, stdout, stderr, ctx)
}
if rf, ok := ret.Get(0).(func(string, io.ReadWriter, io.Writer, io.Writer, context.Context) <-chan ExitInfo); ok {
r0 = rf(id, stdin, stdout, stderr, ctx)
} else {
@ -61,7 +66,6 @@ func (_m *RunnerMock) ExecuteInteractively(id string, stdin io.ReadWriter, stdou
}
}
var r1 context.CancelFunc
if rf, ok := ret.Get(1).(func(string, io.ReadWriter, io.Writer, io.Writer, context.Context) context.CancelFunc); ok {
r1 = rf(id, stdin, stdout, stderr, ctx)
} else {
@ -70,7 +74,6 @@ func (_m *RunnerMock) ExecuteInteractively(id string, stdin io.ReadWriter, stdou
}
}
var r2 error
if rf, ok := ret.Get(2).(func(string, io.ReadWriter, io.Writer, io.Writer, context.Context) error); ok {
r2 = rf(id, stdin, stdout, stderr, ctx)
} else {
@ -200,13 +203,12 @@ func (_m *RunnerMock) UpdateFileSystem(request *dto.UpdateFileSystemRequest, ctx
return r0
}
type mockConstructorTestingTNewRunnerMock interface {
// NewRunnerMock creates a new instance of RunnerMock. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
// The first argument is typically a *testing.T value.
func NewRunnerMock(t interface {
mock.TestingT
Cleanup(func())
}
// NewRunnerMock creates a new instance of RunnerMock. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations.
func NewRunnerMock(t mockConstructorTestingTNewRunnerMock) *RunnerMock {
}) *RunnerMock {
mock := &RunnerMock{}
mock.Mock.Test(t)