From 4b2cae0bd1beb712a3c4cc792e29b01ea0a44632 Mon Sep 17 00:00:00 2001 From: Konrad Hanff Date: Thu, 17 Jun 2021 10:16:50 +0200 Subject: [PATCH] Add inactivity timeout for runners. By removing runners after a specified timeout they no longer stay around indefinitely and block Nomads capacities. The timeout can be set individually per runner when requesting the provide route. If it is set to 0, the runner is never removed automatically. The timeout is reset when activity is detected. Currently that is when something gets executed or the filesystem gets modified. --- api/runners.go | 10 ++-- api/websocket.go | 7 +-- nomad/nomad.go | 4 +- runner/manager.go | 13 ++++- runner/runner.go | 116 ++++++++++++++++++++++++++++++++++++++++-- runner/runner_test.go | 79 +++++++++++++++++++++++++++- 6 files changed, 212 insertions(+), 17 deletions(-) diff --git a/api/runners.go b/api/runners.go index 761c541..73104ff 100644 --- a/api/runners.go +++ b/api/runners.go @@ -9,6 +9,7 @@ import ( "gitlab.hpi.de/codeocean/codemoon/poseidon/runner" "net/http" "net/url" + "time" ) const ( @@ -49,16 +50,19 @@ func (r *RunnerController) provide(writer http.ResponseWriter, request *http.Req environmentId := runner.EnvironmentID(runnerRequest.ExecutionEnvironmentId) nextRunner, err := r.manager.Claim(environmentId) if err != nil { - if err == runner.ErrUnknownExecutionEnvironment { + switch err { + case runner.ErrUnknownExecutionEnvironment: writeNotFound(writer, err) - } else if err == runner.ErrNoRunnersAvailable { + case runner.ErrNoRunnersAvailable: log.WithField("environment", environmentId).Warn("No runners available") writeInternalServerError(writer, err, dto.ErrorNomadOverload) - } else { + default: writeInternalServerError(writer, err, dto.ErrorUnknown) } return } + timeout := time.Duration(runnerRequest.InactivityTimeout) * time.Second + nextRunner.SetupTimeout(timeout, nextRunner, r.manager) sendJson(writer, &dto.RunnerResponse{Id: nextRunner.Id()}, http.StatusOK) } diff --git a/api/websocket.go b/api/websocket.go index d05a32b..3cf0f23 100644 --- a/api/websocket.go +++ b/api/websocket.go @@ -186,11 +186,8 @@ func (wp *webSocketProxy) waitForExit(exit <-chan runner.ExitInfo, cancelExecuti return } - if exitInfo.Err == context.DeadlineExceeded { - err := wp.sendToClient(dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout}) - if err != nil { - return - } + if errors.Is(exitInfo.Err, context.DeadlineExceeded) || errors.Is(exitInfo.Err, runner.ErrorRunnerInactivityTimeout) { + _ = wp.sendToClient(dto.WebSocketMessage{Type: dto.WebSocketMetaTimeout}) return } else if exitInfo.Err != nil { errorMessage := "Error executing the request" diff --git a/nomad/nomad.go b/nomad/nomad.go index 4ce5a64..697aae2 100644 --- a/nomad/nomad.go +++ b/nomad/nomad.go @@ -63,8 +63,8 @@ type ExecutorAPI interface { ExecuteCommand(allocationID string, ctx context.Context, command []string, tty bool, stdin io.Reader, stdout, stderr io.Writer) (int, error) - // MarkRunnerAsUsed marks the runner with the given ID as used. - MarkRunnerAsUsed(runnerID string) error + // MarkRunnerAsUsed marks the runner with the given ID as used. It also stores the timeout duration in the metadata. + MarkRunnerAsUsed(runnerID string, duration int) error } // APIClient implements the ExecutorAPI interface and can be used to perform different operations on the real diff --git a/runner/manager.go b/runner/manager.go index c7640e3..f0b86d6 100644 --- a/runner/manager.go +++ b/runner/manager.go @@ -185,6 +185,8 @@ func (m *NomadRunnerManager) Claim(environmentID EnvironmentID) (Runner, error) return nil, fmt.Errorf("can't mark runner as used: %w", err) } + runner.SetupTimeout(time.Duration(duration) * time.Second) + err = m.scaleEnvironment(environmentID) if err != nil { log.WithError(err).WithField("environmentID", environmentID).Error("Couldn't scale environment") @@ -202,6 +204,7 @@ func (m *NomadRunnerManager) Get(runnerID string) (Runner, error) { } func (m *NomadRunnerManager) Return(r Runner) (err error) { + r.StopTimeout() err = m.apiClient.DeleteRunner(r.Id()) if err != nil { return @@ -234,9 +237,15 @@ func (m *NomadRunnerManager) Load() { continue } isUsed := configTaskGroup.Meta[nomad.ConfigMetaUsedKey] == nomad.ConfigMetaUsedValue - newJob := NewNomadJob(*job.ID, m.apiClient) + newJob := NewNomadJob(*job.ID, m.apiClient, m) if isUsed { m.usedRunners.Add(newJob) + timeout, err := strconv.Atoi(configTaskGroup.Meta[nomad.ConfigMetaTimeoutKey]) + if err != nil { + log.WithError(err).Warn("Error loading timeout from meta values") + } else { + newJob.SetupTimeout(time.Duration(timeout) * time.Second) + } } else { environment.idleRunners.Add(newJob) } @@ -273,7 +282,7 @@ func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation) { job, ok := m.environments.Get(environmentID) if ok { - job.idleRunners.Add(NewNomadJob(alloc.JobID, m.apiClient)) + job.idleRunners.Add(NewNomadJob(alloc.JobID, m.apiClient, m)) } } diff --git a/runner/runner.go b/runner/runner.go index c27b295..239ca74 100644 --- a/runner/runner.go +++ b/runner/runner.go @@ -11,6 +11,7 @@ import ( "gitlab.hpi.de/codeocean/codemoon/poseidon/nomad" "io" "strings" + "sync" "time" ) @@ -26,14 +27,113 @@ const ( ) var ( - ErrorFileCopyFailed = errors.New("file copy failed") + ErrorFileCopyFailed = errors.New("file copy failed") + ErrorRunnerInactivityTimeout = errors.New("runner inactivity timeout exceeded") ) +// InactivityTimer is a wrapper around a timer that is used to delete a a Runner after some time of inactivity. +type InactivityTimer interface { + // SetupTimeout starts the timeout after a runner gets deleted. + SetupTimeout(duration time.Duration) + + // ResetTimeout resets the current timeout so that the runner gets deleted after the time set in Setup from now. + // It does not make an already expired timer run again. + ResetTimeout() + + // StopTimeout stops the timeout but does not remove the runner. + StopTimeout() + + // TimeoutPassed returns true if the timeout expired and false otherwise. + TimeoutPassed() bool +} + +type TimerState uint8 + +const ( + TimerInactive TimerState = 0 + TimerRunning TimerState = 1 + TimerExpired TimerState = 2 +) + +type InactivityTimerImplementation struct { + timer *time.Timer + duration time.Duration + state TimerState + runner Runner + manager Manager + sync.Mutex +} + +func NewInactivityTimer(runner Runner, manager Manager) InactivityTimer { + return &InactivityTimerImplementation{ + state: TimerInactive, + runner: runner, + manager: manager, + } +} + +func (t *InactivityTimerImplementation) SetupTimeout(duration time.Duration) { + t.Lock() + defer t.Unlock() + // Stop old timer if present. + if t.timer != nil { + t.timer.Stop() + } + if duration == 0 { + t.state = TimerInactive + return + } + t.state = TimerRunning + t.duration = duration + + t.timer = time.AfterFunc(duration, func() { + t.Lock() + t.state = TimerExpired + // The timer must be unlocked here already in order to avoid a deadlock with the call to StopTimout in Manager.Return. + t.Unlock() + err := t.manager.Return(t.runner) + if err != nil { + log.WithError(err).WithField("id", t.runner.Id()).Warn("Returning runner after inactivity caused an error") + } else { + log.WithField("id", t.runner.Id()).Info("Returning runner due to inactivity timeout") + } + }) +} + +func (t *InactivityTimerImplementation) ResetTimeout() { + t.Lock() + defer t.Unlock() + if t.state != TimerRunning { + // The timer has already expired or been stopped. We don't want to restart it. + return + } + if t.timer.Stop() { + t.timer.Reset(t.duration) + } else { + log.Error("Timer is in state running but stopped. This should never happen") + } +} + +func (t *InactivityTimerImplementation) StopTimeout() { + t.Lock() + defer t.Unlock() + if t.state != TimerRunning { + return + } + t.timer.Stop() + t.state = TimerInactive +} + +func (t *InactivityTimerImplementation) TimeoutPassed() bool { + return t.state == TimerExpired +} + type Runner interface { // Id returns the id of the runner. Id() string ExecutionStorage + InactivityTimer // ExecuteInteractively runs the given execution request and forwards from and to the given reader and writers. // An ExitInfo is sent to the exit channel on command completion. @@ -53,17 +153,20 @@ type Runner interface { // NomadJob is an abstraction to communicate with Nomad environments. type NomadJob struct { ExecutionStorage + InactivityTimer id string api nomad.ExecutorAPI } // NewNomadJob creates a new NomadJob with the provided id. -func NewNomadJob(id string, apiClient nomad.ExecutorAPI) *NomadJob { - return &NomadJob{ +func NewNomadJob(id string, apiClient nomad.ExecutorAPI, manager Manager) *NomadJob { + job := &NomadJob{ id: id, api: apiClient, ExecutionStorage: NewLocalExecutionStorage(), } + job.InactivityTimer = NewInactivityTimer(job, manager) + return job } func (r *NomadJob) Id() string { @@ -80,6 +183,8 @@ func (r *NomadJob) ExecuteInteractively( stdin io.Reader, stdout, stderr io.Writer, ) (<-chan ExitInfo, context.CancelFunc) { + r.ResetTimeout() + command := request.FullCommand() var ctx context.Context var cancel context.CancelFunc @@ -91,6 +196,9 @@ func (r *NomadJob) ExecuteInteractively( exit := make(chan ExitInfo) go func() { exitCode, err := r.api.ExecuteCommand(r.id, ctx, command, true, stdin, stdout, stderr) + if err == nil && r.TimeoutPassed() { + err = ErrorRunnerInactivityTimeout + } exit <- ExitInfo{uint8(exitCode), err} close(exit) }() @@ -98,6 +206,8 @@ func (r *NomadJob) ExecuteInteractively( } func (r *NomadJob) UpdateFileSystem(copyRequest *dto.UpdateFileSystemRequest) error { + r.ResetTimeout() + var tarBuffer bytes.Buffer if err := createTarArchiveForFiles(copyRequest.Copy, &tarBuffer); err != nil { return err diff --git a/runner/runner_test.go b/runner/runner_test.go index 5a268bf..a4d66c7 100644 --- a/runner/runner_test.go +++ b/runner/runner_test.go @@ -80,8 +80,8 @@ func TestExecuteCallsAPI(t *testing.T) { request := &dto.ExecutionRequest{Command: "echo 'Hello World!'"} runner.ExecuteInteractively(request, nil, nil, nil) - <-time.After(50 * time.Millisecond) - apiMock.AssertCalled(t, "ExecuteCommand", tests.DefaultRunnerID, mock.Anything, request.FullCommand(), true, mock.Anything, mock.Anything, mock.Anything) + time.Sleep(tests.ShortTimeout) + s.apiMock.AssertCalled(s.T(), "ExecuteCommand", tests.DefaultRunnerID, mock.Anything, request.FullCommand(), true, mock.Anything, mock.Anything, mock.Anything) } func TestExecuteReturnsAfterTimeout(t *testing.T) { @@ -252,6 +252,81 @@ func (s *UpdateFileSystemTestSuite) readFilesFromTarArchive(tarArchive io.Reader return files } +func TestInactivityTimerTestSuite(t *testing.T) { + suite.Run(t, new(InactivityTimerTestSuite)) +} + +type InactivityTimerTestSuite struct { + suite.Suite + runner Runner + manager *ManagerMock + returned chan bool +} + +func (s *InactivityTimerTestSuite) SetupTest() { + s.returned = make(chan bool) + s.runner = NewRunner(tests.DefaultRunnerID) + s.manager = &ManagerMock{} + s.manager.On("Return", mock.Anything).Run(func(_ mock.Arguments) { + s.returned <- true + }).Return(nil) + + s.runner.SetupTimeout(tests.ShortTimeout, s.runner, s.manager) +} + +func (s *InactivityTimerTestSuite) TearDownTest() { + s.runner.StopTimeout() +} + +func (s *InactivityTimerTestSuite) TestRunnerIsReturnedAfterTimeout() { + s.True(tests.ChannelReceivesSomething(s.returned, 2*tests.ShortTimeout)) +} + +func (s *InactivityTimerTestSuite) TestRunnerIsNotReturnedBeforeTimeout() { + s.False(tests.ChannelReceivesSomething(s.returned, tests.ShortTimeout/2)) +} + +func (s *InactivityTimerTestSuite) TestResetTimeoutExtendsTheDeadline() { + time.Sleep(2 * tests.ShortTimeout / 4) + s.runner.ResetTimeout() + s.False(tests.ChannelReceivesSomething(s.returned, 3*tests.ShortTimeout/4), + "Because of the reset, the timeout should not be reached by now.") + s.True(tests.ChannelReceivesSomething(s.returned, 5*tests.ShortTimeout/4), + "After reset, the timout should be reached by now.") +} + +func (s *InactivityTimerTestSuite) TestStopTimeoutStopsTimeout() { + s.runner.StopTimeout() + s.False(tests.ChannelReceivesSomething(s.returned, 2*tests.ShortTimeout)) +} + +func (s *InactivityTimerTestSuite) TestTimeoutPassedReturnsFalseBeforeDeadline() { + s.False(s.runner.TimeoutPassed()) +} + +func (s *InactivityTimerTestSuite) TestTimeoutPassedReturnsTrueAfterDeadline() { + time.Sleep(2 * tests.ShortTimeout) + s.True(s.runner.TimeoutPassed()) +} + +func (s *InactivityTimerTestSuite) TestTimerIsNotResetAfterDeadline() { + time.Sleep(2 * tests.ShortTimeout) + <-s.returned + s.runner.ResetTimeout() + s.False(tests.ChannelReceivesSomething(s.returned, 2*tests.ShortTimeout)) +} + +func (s *InactivityTimerTestSuite) TestSetupTimoutStopsOldTimout() { + s.runner.SetupTimeout(3*tests.ShortTimeout, s.runner, s.manager) + s.False(tests.ChannelReceivesSomething(s.returned, 2*tests.ShortTimeout)) + s.True(tests.ChannelReceivesSomething(s.returned, 2*tests.ShortTimeout)) +} + +func (s *InactivityTimerTestSuite) TestTimerIsInactiveWhenDurationIsZero() { + s.runner.SetupTimeout(0, s.runner, s.manager) + s.False(tests.ChannelReceivesSomething(s.returned, tests.ShortTimeout)) +} + // NewRunner creates a new runner with the provided id. func NewRunner(id string) Runner { return NewNomadJob(id, nil)