Fix monitoring events not being canceled for removed environments and runners.
2022-10-13 22:17:45 +01:00
parent 5d54b0f786
commit 7119f3e012
9 changed files with 59 additions and 31 deletions
--- a/internal/runner/abstract_manager.go
+++ b/internal/runner/abstract_manager.go
@@ -1,6 +1,7 @@
 package runner

 import (
+	"context"
 	"errors"
 	"fmt"
 	"github.com/influxdata/influxdb-client-go/v2/api/write"
@@ -22,12 +23,13 @@ type AbstractManager struct {
 }

 // NewAbstractManager creates a new abstract runner manager that keeps track of all runners of one kind.
+// Since this manager is currently bound directly to the lifespan of Poseidon, its storages use context.Background() and need no cancel function.
 func NewAbstractManager() *AbstractManager {
 	return &AbstractManager{
 		environments: storage.NewMonitoredLocalStorage[ExecutionEnvironment](
-			monitoring.MeasurementEnvironments, monitorEnvironmentData, 0),
+			monitoring.MeasurementEnvironments, monitorEnvironmentData, 0, context.Background()),
 		usedRunners: storage.NewMonitoredLocalStorage[Runner](
-			monitoring.MeasurementUsedRunner, MonitorRunnersEnvironmentID, time.Hour),
+			monitoring.MeasurementUsedRunner, MonitorRunnersEnvironmentID, time.Hour, context.Background()),
 	}
 }

--- a/internal/runner/aws_runner.go
+++ b/internal/runner/aws_runner.go
@@ -37,6 +37,8 @@ type AWSFunctionWorkload struct {
 	runningExecutions map[execution.ID]context.CancelFunc
 	onDestroy         DestroyRunnerHandler
 	environment       ExecutionEnvironment
+	ctx               context.Context
+	cancel            context.CancelFunc
 }

 // NewAWSFunctionWorkload creates a new AWSFunctionWorkload with the provided id.
@@ -47,15 +49,18 @@ func NewAWSFunctionWorkload(
 		return nil, fmt.Errorf("failed generating runner id: %w", err)
 	}

+	ctx, cancel := context.WithCancel(context.Background())
 	workload := &AWSFunctionWorkload{
 		id:                newUUID.String(),
 		fs:                make(map[dto.FilePath][]byte),
 		runningExecutions: make(map[execution.ID]context.CancelFunc),
 		onDestroy:         onDestroy,
 		environment:       environment,
+		ctx:               ctx,
+		cancel:            cancel,
 	}
 	workload.executions = storage.NewMonitoredLocalStorage[*dto.ExecutionRequest](
-		monitoring.MeasurementExecutionsAWS, monitorExecutionsRunnerID(environment.ID(), workload.id), time.Minute)
+		monitoring.MeasurementExecutionsAWS, monitorExecutionsRunnerID(environment.ID(), workload.id), time.Minute, ctx)
 	workload.InactivityTimer = NewInactivityTimer(workload, func(_ Runner) error {
 		return workload.Destroy()
 	})
@@ -92,7 +97,7 @@ func (w *AWSFunctionWorkload) ExecuteInteractively(id string, _ io.ReadWriter, s
 	}
 	hideEnvironmentVariables(request, "AWS")
 	request.PrivilegedExecution = true // AWS does not support multiple users at this moment.
-	command, ctx, cancel := prepareExecution(request)
+	command, ctx, cancel := prepareExecution(request, w.ctx)
 	exitInternal := make(chan ExitInfo)
 	exit := make(chan ExitInfo, 1)

@@ -131,9 +136,7 @@ func (w *AWSFunctionWorkload) GetFileContent(_ string, _ http.ResponseWriter, _
 }

 func (w *AWSFunctionWorkload) Destroy() error {
-	for _, cancel := range w.runningExecutions {
-		cancel()
-	}
+	w.cancel()
 	if err := w.onDestroy(w); err != nil {
 		return fmt.Errorf("error while destroying aws runner: %w", err)
 	}
--- a/internal/runner/nomad_runner.go
+++ b/internal/runner/nomad_runner.go
@@ -47,6 +47,8 @@ type NomadJob struct {
 	portMappings []nomadApi.PortMapping
 	api          nomad.ExecutorAPI
 	onDestroy    DestroyRunnerHandler
+	ctx          context.Context
+	cancel       context.CancelFunc
 }

 // NewNomadJob creates a new NomadJob with the provided id.
@@ -55,14 +57,17 @@ type NomadJob struct {
 func NewNomadJob(id string, portMappings []nomadApi.PortMapping,
 	apiClient nomad.ExecutorAPI, onDestroy DestroyRunnerHandler,
 ) *NomadJob {
+	ctx, cancel := context.WithCancel(context.Background())
 	job := &NomadJob{
 		id:           id,
 		portMappings: portMappings,
 		api:          apiClient,
 		onDestroy:    onDestroy,
+		ctx:          ctx,
+		cancel:       cancel,
 	}
 	job.executions = storage.NewMonitoredLocalStorage[*dto.ExecutionRequest](
-		monitoring.MeasurementExecutionsNomad, monitorExecutionsRunnerID(job.Environment(), id), time.Minute)
+		monitoring.MeasurementExecutionsNomad, monitorExecutionsRunnerID(job.Environment(), id), time.Minute, ctx)
 	job.InactivityTimer = NewInactivityTimer(job, onDestroy)
 	return job
 }
@@ -111,10 +116,10 @@ func (r *NomadJob) ExecuteInteractively(

 	r.ResetTimeout()

-	command, ctx, cancel := prepareExecution(request)
+	command, ctx, cancel := prepareExecution(request, r.ctx)
 	exitInternal := make(chan ExitInfo)
 	exit := make(chan ExitInfo, 1)
-	ctxExecute, cancelExecute := context.WithCancel(context.Background())
+	ctxExecute, cancelExecute := context.WithCancel(r.ctx)

 	go r.executeCommand(ctxExecute, command, request.PrivilegedExecution, stdin, stdout, stderr, exitInternal)
 	go r.handleExitOrContextDone(ctx, cancelExecute, exitInternal, exit, stdin)
@@ -203,20 +208,21 @@ func (r *NomadJob) GetFileContent(
 }

 func (r *NomadJob) Destroy() error {
+	r.cancel()
 	if err := r.onDestroy(r); err != nil {
 		return fmt.Errorf("error while destroying runner: %w", err)
 	}
 	return nil
 }

-func prepareExecution(request *dto.ExecutionRequest) (
+func prepareExecution(request *dto.ExecutionRequest, environmentCtx context.Context) (
 	command []string, ctx context.Context, cancel context.CancelFunc,
 ) {
 	command = request.FullCommand()
 	if request.TimeLimit == 0 {
-		ctx, cancel = context.WithCancel(context.Background())
+		ctx, cancel = context.WithCancel(environmentCtx)
 	} else {
-		ctx, cancel = context.WithTimeout(context.Background(), time.Duration(request.TimeLimit)*time.Second)
+		ctx, cancel = context.WithTimeout(environmentCtx, time.Duration(request.TimeLimit)*time.Second)
 	}
 	return command, ctx, cancel
 }
--- a/internal/runner/nomad_runner_test.go
+++ b/internal/runner/nomad_runner_test.go
@@ -127,6 +127,7 @@ func (s *ExecuteInteractivelyTestSuite) SetupTest() {
 		id:              tests.DefaultRunnerID,
 		api:             s.apiMock,
 		onDestroy:       s.manager.Return,
+		ctx:             context.Background(),
 	}
 }

@@ -207,6 +208,7 @@ func (s *ExecuteInteractivelyTestSuite) TestDestroysRunnerAfterTimeoutAndSignal(
 	})
 	timeLimit := 1
 	executionRequest := &dto.ExecutionRequest{TimeLimit: timeLimit}
+	s.runner.cancel = func() {}
 	s.runner.StoreExecution(defaultExecutionID, executionRequest)
 	_, _, err := s.runner.ExecuteInteractively(defaultExecutionID, bytes.NewBuffer(make([]byte, 1)), nil, nil)
 	s.Require().NoError(err)