Fix Nomad event race condition

The race condition was triggered by the simultaneous deletion of a runner due to inactivity and the rescheduling of its allocation due to a lost node.
This led to the allocation first being rescheduled and then being stopped, which unexpectedly stopped a pending runner on a lower level.
To fix this, the upper level now communicates that the stop of the job was expected.
This commit is contained in:
Maximilian Paß
2023-06-08 10:23:05 +01:00
committed by Sebastian Serth
parent b620d0fad7
commit f031219cb8
3 changed files with 44 additions and 10 deletions

View File

@ -177,22 +177,26 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
monitoring.WriteInfluxPoint(p)
}
func (m *NomadRunnerManager) onAllocationStopped(runnerID string) {
// onAllocationStopped handles a stopped runner: it deletes runnerID from
// m.usedRunners and, if the runner's environment is known, from that
// environment as well.
//
// It returns false without touching any state when runnerID is an environment
// template ID or when no environment ID can be derived from it. Otherwise
// alreadyRemoved reports whether runnerID was absent from m.usedRunners at the
// time of the call.
//
// NOTE(review): presumably callers use alreadyRemoved to tell an expected stop
// from an unexpected one; a runner that was never in m.usedRunners also yields
// true - confirm against the call sites.
func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
log.WithField("id", runnerID).Debug("Runner stopped")
// IDs recognized by nomad.IsEnvironmentTemplateID are not handled here.
if nomad.IsEnvironmentTemplateID(runnerID) {
return
return false
}
environmentID, err := nomad.EnvironmentIDFromRunnerID(runnerID)
if err != nil {
log.WithError(err).Warn("Stopped allocation can not be handled")
return
return false
}
// Check membership before deleting; the return value is derived from it.
_, stillActive := m.usedRunners.Get(runnerID)
m.usedRunners.Delete(runnerID)
// The runner is also dropped from its environment when that environment exists.
environment, ok := m.environments.Get(environmentID.ToString())
if ok {
environment.DeleteRunner(runnerID)
}
return !stillActive
}