From 6f84a437bbc31d91acc464133bc9d7533967a8d1 Mon Sep 17 00:00:00 2001 From: Maximilian Pass <22845248+mpass99@users.noreply.github.com> Date: Thu, 13 Jun 2024 12:50:15 +0200 Subject: [PATCH] Fix Nomad runner recovery that led to missing runners when Nomad creates runners faster than the recovery process takes. By starting the Event Stream not after, but with the start of the runner recovery, Poseidon is able to register the runners started in the recovery process. --- cmd/poseidon/main.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/cmd/poseidon/main.go b/cmd/poseidon/main.go index 42a5484..46bfb29 100644 --- a/cmd/poseidon/main.go +++ b/cmd/poseidon/main.go @@ -354,13 +354,21 @@ func createNomadManager(ctx context.Context) (runner.Manager, environment.Manage func synchronizeNomad(ctx context.Context, environmentManager *environment.NomadEnvironmentManager, runnerManager *runner.NomadRunnerManager) { firstRecoveryDone := make(chan struct{}) go environmentManager.KeepEnvironmentsSynced(func(ctx context.Context) error { - runnerManager.Load() + go func() { + // `Load` not only recover existing runners, but also applies the prewarming pool size which creates runners. + // Therefore, the Nomad Event Stream has to be started before. + runnerManager.Load() - select { - case firstRecoveryDone <- struct{}{}: - log.Info("First Recovery Done") - default: - } + select { + case firstRecoveryDone <- struct{}{}: + log.Info("First Recovery Done") + default: + } + }() + + // The Race Condition between the startup of the event stream and the recovery of missing runners is uncritical + // because setting the start time of the stream is the first thing done in `SynchronizeRunners` while `Load` + // first starts an HTTP request for each individual existing runner. if err := runnerManager.SynchronizeRunners(ctx); err != nil { return fmt.Errorf("synchronize runners failed: %w", err)