poseidon/internal/runner/nomad_manager.go
Maximilian Paß 6b69a2d732 Refactor Nomad Recovery:
from an approach that loaded the runners only once at startup
to one that is repeated, e.g. when the Nomad Event Stream connection is interrupted.
2023-10-31 15:49:56 +01:00

package runner

import (
	"context"
	"errors"
	"fmt"
	"strconv"
	"time"

	nomadApi "github.com/hashicorp/nomad/api"
	influxdb2 "github.com/influxdata/influxdb-client-go/v2"
	"github.com/openHPI/poseidon/internal/nomad"
	"github.com/openHPI/poseidon/pkg/dto"
	"github.com/openHPI/poseidon/pkg/logging"
	"github.com/openHPI/poseidon/pkg/monitoring"
	"github.com/openHPI/poseidon/pkg/storage"
	"github.com/openHPI/poseidon/pkg/util"
	"github.com/sirupsen/logrus"
)

var (
	log                            = logging.GetLogger("runner")
	ErrUnknownExecutionEnvironment = errors.New("execution environment not found")
	ErrNoRunnersAvailable          = errors.New("no runners available for this execution environment")
	ErrRunnerNotFound              = errors.New("no runner found with this id")
)

type NomadRunnerManager struct {
	*AbstractManager
	apiClient nomad.ExecutorAPI
}

// NewNomadRunnerManager creates a new runner manager that keeps track of all runners.
// SynchronizeRunners has to be started separately.
func NewNomadRunnerManager(apiClient nomad.ExecutorAPI, ctx context.Context) *NomadRunnerManager {
	return &NomadRunnerManager{NewAbstractManager(ctx), apiClient}
}
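
// Claim takes an idle runner from the requested environment, marks it as used (locally and in Nomad),
// and sets up its timeout of the given duration (in seconds).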
func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int) (Runner, error) {
	environment, ok := m.environments.Get(environmentID.ToString())
	if !ok {
		return nil, ErrUnknownExecutionEnvironment
	}
	runner, ok := environment.Sample()
	if !ok {
		return nil, ErrNoRunnersAvailable
	}
	m.usedRunners.Add(runner.ID(), runner)
	go m.markRunnerAsUsed(runner, duration)
	runner.SetupTimeout(time.Duration(duration) * time.Second)
	return runner, nil
}
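
// markRunnerAsUsed informs Nomad that the runner is in use, retrying with exponential backoff.
// If this ultimately fails, the runner is returned (and thereby destroyed).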
func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
	err := util.RetryExponential(func() (err error) {
		if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
			err = fmt.Errorf("cannot mark runner as used: %w", err)
		}
		return
	})
	if err != nil {
		log.WithError(err).WithField(dto.KeyRunnerID, runner.ID()).Error("cannot mark runner as used")
		err := m.Return(runner)
		if err != nil {
			log.WithError(err).WithField(dto.KeyRunnerID, runner.ID()).Error("can't mark runner as used and can't return runner")
		}
	}
}
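
// Return removes the runner from the used runners and destroys it.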
func (m *NomadRunnerManager) Return(r Runner) error {
	m.usedRunners.Delete(r.ID())
	err := r.Destroy(ErrDestroyedByAPIRequest)
	if err != nil {
		err = fmt.Errorf("cannot return runner: %w", err)
	}
	return err
}

// SynchronizeRunners loads all runners and keeps them synchronized (without a retry mechanism).
func (m *NomadRunnerManager) SynchronizeRunners(ctx context.Context) error {
	// Load Runners
	if err := m.load(); err != nil {
		return fmt.Errorf("failed loading runners: %w", err)
	}
	// Watch for changes regarding the existing or new runners.
	err := m.apiClient.WatchEventStream(ctx,
		&nomad.AllocationProcessing{OnNew: m.onAllocationAdded, OnDeleted: m.onAllocationStopped})
	if err != nil && !(errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)) {
		err = fmt.Errorf("nomad Event Stream failed: %w", err)
	}
	return err
}

// load recovers all runners for all existing environments.
func (m *NomadRunnerManager) load() error {
	newUsedRunners := storage.NewLocalStorage[Runner]()
	for _, environment := range m.environments.List() {
		environmentLogger := log.WithField(dto.KeyEnvironmentID, environment.ID().ToString())
		runnerJobs, err := m.apiClient.LoadRunnerJobs(environment.ID())
		if err != nil {
			return fmt.Errorf("failed fetching the runner jobs: %w", err)
		}
		for _, job := range runnerJobs {
			m.loadSingleJob(job, environmentLogger, environment, newUsedRunners)
		}
		err = environment.ApplyPrewarmingPoolSize()
		if err != nil {
			return fmt.Errorf("couldn't scale environment: %w", err)
		}
	}
	m.updateUsedRunners(newUsedRunners)
	return nil
}
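
// loadSingleJob recovers a single runner job: already used runners are collected in newUsedRunners
// and their timeout is restored, while idle runners are added back to the environment.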
func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger *logrus.Entry,
	environment ExecutionEnvironment, newUsedRunners storage.Storage[Runner]) {
	configTaskGroup := nomad.FindTaskGroup(job, nomad.ConfigTaskGroupName)
	if configTaskGroup == nil {
		environmentLogger.Warnf("Couldn't find config task group in job %s, skipping ...", *job.ID)
		return
	}
	isUsed := configTaskGroup.Meta[nomad.ConfigMetaUsedKey] == nomad.ConfigMetaUsedValue
	portMappings, err := m.apiClient.LoadRunnerPortMappings(*job.ID)
	if err != nil {
		environmentLogger.WithError(err).Warn("Error loading runner portMappings, skipping ...")
		return
	}
	newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.onRunnerDestroyed)
	log.WithField("isUsed", isUsed).WithField(dto.KeyRunnerID, newJob.ID()).Debug("Recovered Runner")
	if isUsed {
		newUsedRunners.Add(newJob.ID(), newJob)
		timeout, err := strconv.Atoi(configTaskGroup.Meta[nomad.ConfigMetaTimeoutKey])
		if err != nil {
			environmentLogger.WithError(err).Warn("Error loading timeout from meta values")
		} else {
			newJob.SetupTimeout(time.Duration(timeout) * time.Second)
		}
	} else {
		environment.AddRunner(newJob)
	}
}
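
// updateUsedRunners destroys the previously used runners (either because they were replaced by a
// recovered runner or because they could not be recovered) and stores the newly recovered ones.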
func (m *NomadRunnerManager) updateUsedRunners(newUsedRunners storage.Storage[Runner]) {
	for _, r := range m.usedRunners.List() {
		var reason DestroyReason
		if _, ok := newUsedRunners.Get(r.ID()); ok {
			reason = ErrDestroyedAndReplaced
		} else {
			reason = ErrLocalDestruction
			log.WithError(reason).WithField(dto.KeyRunnerID, r.ID()).Warn("Local runner cannot be recovered")
		}
		m.usedRunners.Delete(r.ID())
		if err := r.Destroy(reason); err != nil {
			log.WithError(err).WithField(dto.KeyRunnerID, r.ID()).Warn("failed to destroy runner locally")
		}
	}
	for _, r := range newUsedRunners.List() {
		m.usedRunners.Add(r.ID(), r)
	}
}

// onAllocationAdded is the callback for when Nomad started an allocation.
func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, startup time.Duration) {
	log.WithField(dto.KeyRunnerID, alloc.JobID).WithField("startupDuration", startup).Debug("Runner started")
	if nomad.IsEnvironmentTemplateID(alloc.JobID) {
		return
	}
	if _, ok := m.usedRunners.Get(alloc.JobID); ok {
		log.WithField(dto.KeyRunnerID, alloc.JobID).WithField("states", alloc.TaskStates).
			Error("Started Runner is already in use")
		return
	}
	environmentID, err := nomad.EnvironmentIDFromRunnerID(alloc.JobID)
	if err != nil {
		log.WithError(err).Warn("Allocation could not be added")
		return
	}
	environment, ok := m.environments.Get(environmentID.ToString())
	if ok {
		var mappedPorts []nomadApi.PortMapping
		if alloc.AllocatedResources != nil {
			mappedPorts = alloc.AllocatedResources.Shared.Ports
		}
		environment.AddRunner(NewNomadJob(alloc.JobID, mappedPorts, m.apiClient, m.onRunnerDestroyed))
		monitorAllocationStartupDuration(startup, alloc.JobID, environmentID)
	}
}
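
// monitorAllocationStartupDuration reports the startup duration of an idle runner to InfluxDB.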
func monitorAllocationStartupDuration(startup time.Duration, runnerID string, environmentID dto.EnvironmentID) {
	p := influxdb2.NewPointWithMeasurement(monitoring.MeasurementIdleRunnerNomad)
	p.AddField(monitoring.InfluxKeyStartupDuration, startup.Nanoseconds())
	p.AddTag(monitoring.InfluxKeyEnvironmentID, environmentID.ToString())
	p.AddTag(monitoring.InfluxKeyRunnerID, runnerID)
	monitoring.WriteInfluxPoint(p)
}

// onAllocationStopped is the callback for when Nomad stopped an allocation.
func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error) (alreadyRemoved bool) {
	log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped")
	if nomad.IsEnvironmentTemplateID(runnerID) {
		return false
	}
	environmentID, err := nomad.EnvironmentIDFromRunnerID(runnerID)
	if err != nil {
		log.WithError(err).Warn("Stopped allocation can not be handled")
		return false
	}
	r, stillActive := m.usedRunners.Get(runnerID)
	if stillActive {
		m.usedRunners.Delete(runnerID)
		if err := r.Destroy(reason); err != nil {
			log.WithError(err).Warn("Runner of stopped allocation cannot be destroyed")
		}
	}
	environment, ok := m.environments.Get(environmentID.ToString())
	if ok {
		stillActive = stillActive || environment.DeleteRunner(runnerID)
	}
	return !stillActive
}

// onRunnerDestroyed is the callback when the runner destroys itself.
// Its main use is to remove the runner from the used runners when its timeout expires.
func (m *NomadRunnerManager) onRunnerDestroyed(r Runner) error {
	m.usedRunners.Delete(r.ID())
	environment, ok := m.environments.Get(r.Environment().ToString())
	if ok {
		environment.DeleteRunner(r.ID())
	}
	return nil
}