Destroy runner on allocation stopped

Destroying the runner when Nomad informs us about its allocation being stopped, fixes the error of executions running into their timeout even if the allocation was stopped long ago.
This commit is contained in:
Maximilian Paß
2023-06-26 17:03:52 +01:00
committed by Sebastian Serth
parent 40a5f2eca6
commit bfb5977d24
10 changed files with 104 additions and 74 deletions

View File

@ -71,17 +71,11 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int
func (m *NomadRunnerManager) Return(r Runner) error {
m.usedRunners.Delete(r.ID())
r.StopTimeout()
err := util.RetryExponential(time.Second, func() (err error) {
if err = m.apiClient.DeleteJob(r.ID()); err != nil {
err = fmt.Errorf("error deleting runner in Nomad: %w", err)
}
return
})
err := r.Destroy(false)
if err != nil {
return fmt.Errorf("%w", err)
err = fmt.Errorf("cannot return runner: %w", err)
}
return nil
return err
}
func (m *NomadRunnerManager) Load() {
@ -114,7 +108,7 @@ func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger
environmentLogger.WithError(err).Warn("Error loading runner portMappings")
return
}
newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.Return)
newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.onRunnerDestroyed)
log.WithField("isUsed", isUsed).WithField(dto.KeyRunnerID, newJob.ID()).Debug("Recovered Runner")
if isUsed {
m.usedRunners.Add(newJob.ID(), newJob)
@ -140,6 +134,7 @@ func (m *NomadRunnerManager) keepRunnersSynced(ctx context.Context) {
}
}
// onAllocationAdded is the callback for when Nomad started an allocation.
func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, startup time.Duration) {
log.WithField(dto.KeyRunnerID, alloc.JobID).WithField("startupDuration", startup).Debug("Runner started")
@ -165,7 +160,7 @@ func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, start
if alloc.AllocatedResources != nil {
mappedPorts = alloc.AllocatedResources.Shared.Ports
}
environment.AddRunner(NewNomadJob(alloc.JobID, mappedPorts, m.apiClient, m.Return))
environment.AddRunner(NewNomadJob(alloc.JobID, mappedPorts, m.apiClient, m.onRunnerDestroyed))
monitorAllocationStartupDuration(startup, alloc.JobID, environmentID)
}
}
@ -178,6 +173,7 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
monitoring.WriteInfluxPoint(p)
}
// onAllocationStopped is the callback for when Nomad stopped an allocation.
func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped")
@ -193,8 +189,10 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemove
r, stillActive := m.usedRunners.Get(runnerID)
if stillActive {
r.StopTimeout()
m.usedRunners.Delete(runnerID)
if err := r.Destroy(true); err != nil {
log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")
}
}
environment, ok := m.environments.Get(environmentID.ToString())
@ -204,3 +202,15 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemove
return !stillActive
}
// onRunnerDestroyed is the callback when the runner destroys itself.
// The main use of this callback is to remove the runner from the used runners, when its timeout exceeds.
func (m *NomadRunnerManager) onRunnerDestroyed(r Runner) error {
m.usedRunners.Delete(r.ID())
environment, ok := m.environments.Get(r.Environment().ToString())
if ok {
environment.DeleteRunner(r.ID())
}
return nil
}