Fix Nomad event race condition
The race was triggered when a runner was deleted due to inactivity at the same time as its allocation was rescheduled due to a lost node. As a result, the allocation was first rescheduled and then stopped, which caused a pending runner to be stopped unexpectedly on the lower level. To fix this, we added communication from the upper level that the stop of the job was expected.
This commit is contained in:

committed by
Sebastian Serth

parent
b620d0fad7
commit
f031219cb8
@ -177,22 +177,26 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
|
||||
monitoring.WriteInfluxPoint(p)
|
||||
}
|
||||
|
||||
func (m *NomadRunnerManager) onAllocationStopped(runnerID string) {
|
||||
// onAllocationStopped handles the stop of the allocation belonging to runnerID.
// It removes the runner from m.usedRunners and, when the runner's environment is
// found in m.environments, from that environment as well.
//
// The result alreadyRemoved is true when the runner was not present in
// m.usedRunners at the time of the call. It is false when the runner was still
// present, when runnerID is an environment template ID, or when no environment
// ID can be derived from runnerID.
func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
|
||||
log.WithField("id", runnerID).Debug("Runner stopped")
|
||||
|
||||
// Environment template IDs are not handled here; nothing is removed for them.
if nomad.IsEnvironmentTemplateID(runnerID) {
|
||||
return
|
||||
return false
|
||||
}
|
||||
|
||||
environmentID, err := nomad.EnvironmentIDFromRunnerID(runnerID)
|
||||
if err != nil {
|
||||
log.WithError(err).Warn("Stopped allocation can not be handled")
|
||||
return
|
||||
return false
|
||||
}
|
||||
|
||||
// Record whether the runner is still tracked as used before deleting it;
// the lookup result determines the return value below.
_, stillActive := m.usedRunners.Get(runnerID)
|
||||
m.usedRunners.Delete(runnerID)
|
||||
|
||||
// The runner is also removed from its environment, if that environment is known.
environment, ok := m.environments.Get(environmentID.ToString())
|
||||
if ok {
|
||||
environment.DeleteRunner(runnerID)
|
||||
}
|
||||
|
||||
// NOTE(review): presumably a runner that was already removed means the stop
// was expected by the upper level (e.g. inactivity deletion) — confirm against the caller.
return !stillActive
|
||||
}
|
||||
|
Reference in New Issue
Block a user