Refactor Nomad Recovery

from an approach that loaded the runners only once at startup
to a method that will be repeated, i.e., whenever the Nomad Event Stream connection is interrupted.
This commit is contained in:
Maximilian Paß
2023-10-23 14:36:14 +02:00
committed by Sebastian Serth
parent b2898f9183
commit 6b69a2d732
22 changed files with 211 additions and 120 deletions

View File

@ -41,9 +41,12 @@ var (
ErrorUnknownExecution = errors.New("unknown execution")
ErrorFileCopyFailed = errors.New("file copy failed")
ErrFileNotFound = errors.New("file not found or insufficient permissions")
ErrLocalDestruction DestroyReason = nomad.ErrorLocalDestruction
ErrOOMKilled DestroyReason = nomad.ErrorOOMKilled
ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
ErrCannotStopExecution DestroyReason = errors.New("the execution did not stop after SIGQUIT")
ErrDestroyedAndReplaced DestroyReason = fmt.Errorf("the runner will be destroyed and replaced: %w", ErrLocalDestruction)
ErrEnvironmentUpdated DestroyReason = errors.New("the environment will be destroyed and updated")
)
// NomadJob is an abstraction to communicate with Nomad environments.
@ -258,10 +261,7 @@ func (r *NomadJob) Destroy(reason DestroyReason) (err error) {
}
}
// local determines if a reason is present that the runner should only be removed locally (without requesting Nomad).
local := errors.Is(reason, nomad.ErrorAllocationRescheduled) ||
errors.Is(reason, ErrOOMKilled)
if local {
if errors.Is(reason, ErrLocalDestruction) {
log.WithContext(r.ctx).Debug("Runner destroyed locally")
return nil
}