Fix Nomad event race condition

The race was triggered by the simultaneous deletion of the runner due to inactivity and the rescheduling of the allocation due to a lost node. As a result, the allocation was first rescheduled and then stopped, which caused an unexpected stop of a pending runner on a lower level. To fix this, the upper level now communicates that the stop of the job was expected.
2023-06-08 10:23:05 +01:00
parent b620d0fad7
commit f031219cb8
3 changed files with 44 additions and 10 deletions
--- a/internal/runner/nomad_manager.go
+++ b/internal/runner/nomad_manager.go
@@ -177,22 +177,26 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
 	monitoring.WriteInfluxPoint(p)
 }

-func (m *NomadRunnerManager) onAllocationStopped(runnerID string) {
+func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
 	log.WithField("id", runnerID).Debug("Runner stopped")

 	if nomad.IsEnvironmentTemplateID(runnerID) {
-		return
+		return false
 	}

 	environmentID, err := nomad.EnvironmentIDFromRunnerID(runnerID)
 	if err != nil {
 		log.WithError(err).Warn("Stopped allocation can not be handled")
-		return
+		return false
 	}

+	_, stillActive := m.usedRunners.Get(runnerID)
 	m.usedRunners.Delete(runnerID)
+
 	environment, ok := m.environments.Get(environmentID.ToString())
 	if ok {
 		environment.DeleteRunner(runnerID)
 	}
+
+	return !stillActive
 }