package runner import ( "context" "fmt" "github.com/openHPI/poseidon/internal/kubernetes" "github.com/openHPI/poseidon/internal/nomad" "github.com/openHPI/poseidon/pkg/dto" "github.com/openHPI/poseidon/pkg/storage" "github.com/openHPI/poseidon/pkg/util" appv1 "k8s.io/api/apps/v1" "strconv" "time" ) type KubernetesRunnerManager struct { *AbstractManager apiClient kubernetes.ExecutorAPI reloadingEnvironment storage.Storage[*alertData] } func NewKubernetesRunnerManager(apiClient *kubernetes.ExecutorAPI, ctx context.Context) *KubernetesRunnerManager { return &KubernetesRunnerManager{ AbstractManager: NewAbstractManager(ctx), apiClient: *apiClient, reloadingEnvironment: storage.NewLocalStorage[*alertData](), } } // Load recovers all runners for all existing environments. func (k *KubernetesRunnerManager) Load() { log.Info("Loading runners") newUsedRunners := storage.NewLocalStorage[Runner]() for _, environment := range k.ListEnvironments() { usedRunners, err := k.loadEnvironment(environment) if err != nil { log.WithError(err).WithField(dto.KeyEnvironmentID, environment.ID().ToString()). Warn("Failed loading environment. Skipping...") continue } for _, r := range usedRunners.List() { newUsedRunners.Add(r.ID(), r) } } // TODO MISSING IMPLEMENTATION //k.updateUsedRunners(newUsedRunners, true) } func (k *KubernetesRunnerManager) loadEnvironment(environment ExecutionEnvironment) (used storage.Storage[Runner], err error) { used = storage.NewLocalStorage[Runner]() runnerJobs, err := k.apiClient.LoadRunnerJobs(environment.ID()) if err != nil { return nil, fmt.Errorf("failed fetching the runner jobs: %w", err) } for _, job := range runnerJobs { r, isUsed, err := k.loadSingleJob(job, environment) if err != nil { log.WithError(err).WithField(dto.KeyEnvironmentID, environment.ID().ToString()). WithField("used", isUsed).Warn("Failed loading job. Skipping...") continue } else if isUsed { used.Add(r.ID(), r) } } err = environment.ApplyPrewarmingPoolSize() if err != nil { return used, fmt.Errorf("couldn't scale environment: %w", err) } return used, nil } func (k *KubernetesRunnerManager) loadSingleJob(deployment *appv1.Deployment, environment ExecutionEnvironment) (r Runner, isUsed bool, err error) { configTaskGroup := deployment.Spec.Template if err != nil { return nil, false, fmt.Errorf("%w, %s", nomad.ErrorMissingTaskGroup, deployment.Name) } isUsed = configTaskGroup.Annotations[nomad.ConfigMetaUsedKey] == nomad.ConfigMetaUsedValue portMappings, err := k.apiClient.LoadRunnerPortMappings(deployment.Name) if err != nil { return nil, false, fmt.Errorf("error loading runner portMappings: %w", err) } newJob := NewKubernetesDeployment(deployment.Name, portMappings, k.apiClient, k.onRunnerDestroyed) log.WithField("isUsed", isUsed).WithField(dto.KeyRunnerID, newJob.ID()).Debug("Recovered Runner") if isUsed { timeout, err := strconv.Atoi(configTaskGroup.ObjectMeta.Annotations[nomad.ConfigMetaTimeoutKey]) if err != nil { log.WithField(dto.KeyRunnerID, newJob.ID()).WithError(err).Warn("failed loading timeout from meta values") timeout = int(nomad.RunnerTimeoutFallback.Seconds()) go k.markRunnerAsUsed(newJob, timeout) } newJob.SetupTimeout(time.Duration(timeout) * time.Second) } else { environment.AddRunner(newJob) } return newJob, isUsed, nil } func (k *KubernetesRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) { err := util.RetryExponential(func() (err error) { if err = k.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil { err = fmt.Errorf("cannot mark runner as used: %w", err) } return }) if err != nil { log.WithError(err).WithField(dto.KeyRunnerID, runner.ID()).Error("cannot mark runner as used") err := k.Return(runner) if err != nil { log.WithError(err).WithField(dto.KeyRunnerID, runner.ID()).Error("can't mark runner as used and can't return runner") } } } func (k *KubernetesRunnerManager) onRunnerDestroyed(r Runner) error { k.usedRunners.Delete(r.ID()) environment, ok := k.GetEnvironment(r.Environment()) if ok { environment.DeleteRunner(r.ID()) } return nil }