Fix Nomad runner recovery

that led to missing runners when Nomad creates runners faster than the recovery process completes. By starting the Event Stream not after, but together with the start of the runner recovery, Poseidon is able to register the runners started during the recovery process.
2024-06-13 12:50:15 +02:00
parent 74402535f8
commit 6f84a437bb
1 changed files with 14 additions and 6 deletions
--- a/cmd/poseidon/main.go
+++ b/cmd/poseidon/main.go
@ -354,13 +354,21 @@ func createNomadManager(ctx context.Context) (runner.Manager, environment.Manage
 func synchronizeNomad(ctx context.Context, environmentManager *environment.NomadEnvironmentManager, runnerManager *runner.NomadRunnerManager) {
 	firstRecoveryDone := make(chan struct{})
 	go environmentManager.KeepEnvironmentsSynced(func(ctx context.Context) error {
-		runnerManager.Load()
+		go func() {
+			// `Load` not only recover existing runners, but also applies the prewarming pool size which creates runners.
+			// Therefore, the Nomad Event Stream has to be started before.
+			runnerManager.Load()

-		select {
-		case firstRecoveryDone <- struct{}{}:
-			log.Info("First Recovery Done")
-		default:
-		}
+			select {
+			case firstRecoveryDone <- struct{}{}:
+				log.Info("First Recovery Done")
+			default:
+			}
+		}()
+
+		// The Race Condition between the startup of the event stream and the recovery of missing runners is uncritical
+		// because setting the start time of the stream is the first thing done in `SynchronizeRunners` while `Load`
+		// first starts an HTTP request for each individual existing runner.

 		if err := runnerManager.SynchronizeRunners(ctx); err != nil {
 			return fmt.Errorf("synchronize runners failed: %w", err)