Use Nomad jobs as runners instead of allocations

As we can't control which allocations are destroyed when downscaling a job, we decided
to use Nomad jobs as our runners. Thus, for each runner we prewarm for an environment,
a corresponding job is created in Nomad. We create a default job that serves as a template
for the runners. Using this template, already existing execution environments can easily be
restored once Poseidon is restarted.
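
To make the approach concrete, here is a minimal sketch of the per-runner job creation: the environment's template job is copied, the copy receives a unique ID and name, and it is registered with Nomad as a job of its own. The helper spawnRunnerJob and the direct use of the Nomad API client are illustrative assumptions only; in the diff below the same steps go through Poseidon's ExecutorAPI (LoadTemplateJob, RegisterNomadJob, MonitorEvaluation).

package sketch

import (
	"fmt"

	"github.com/google/uuid"
	nomadApi "github.com/hashicorp/nomad/api"
)

// spawnRunnerJob is an illustrative helper (not part of this commit): it clones the
// environment's template job and registers the clone as a separate Nomad job, so that
// one Nomad job corresponds to exactly one prewarmed runner.
func spawnRunnerJob(client *nomadApi.Client, templateJob *nomadApi.Job, environmentID string) (string, error) {
	newUUID, err := uuid.NewUUID()
	if err != nil {
		return "", fmt.Errorf("failed generating runner id: %w", err)
	}
	// Runner jobs are named "<environment id>-<uuid>".
	runnerID := fmt.Sprintf("%s-%s", environmentID, newUUID.String())

	// Copy the template and give the copy its own ID and name.
	job := *templateJob
	job.ID = &runnerID
	job.Name = &runnerID

	// Register the copy; Nomad then schedules the allocation for this runner.
	resp, _, err := client.Jobs().Register(&job, nil)
	if err != nil {
		return "", fmt.Errorf("couldn't register Nomad job: %w", err)
	}
	return resp.EvalID, nil
}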
sirkrypt0 authored 2021-06-08 14:42:35 +02:00, committed by Maximilian Paß
parent 8de489929e
commit c7d59810e5
20 changed files with 333 additions and 266 deletions


@@ -3,6 +3,8 @@ package runner
import (
"context"
"errors"
"fmt"
"github.com/google/uuid"
nomadApi "github.com/hashicorp/nomad/api"
"gitlab.hpi.de/codeocean/codemoon/poseidon/logging"
"gitlab.hpi.de/codeocean/codemoon/poseidon/nomad"
@@ -17,10 +19,12 @@ var (
ErrRunnerNotFound = errors.New("no runner found with this id")
)
+ const runnerNameFormat = "%s-%s"
type EnvironmentID int
func (e EnvironmentID) toString() string {
- return string(rune(e))
+ return strconv.Itoa(int(e))
}
type NomadJobID string
@@ -29,7 +33,7 @@ type NomadJobID string
// runners to new clients and ensure no runner is used twice.
type Manager interface {
// RegisterEnvironment adds a new environment that should be managed.
- RegisterEnvironment(id EnvironmentID, nomadJobID NomadJobID, desiredIdleRunnersCount uint)
+ RegisterEnvironment(id EnvironmentID, desiredIdleRunnersCount uint) error
// EnvironmentExists returns whether the environment with the given id exists.
EnvironmentExists(id EnvironmentID) bool
@@ -48,9 +52,9 @@ type Manager interface {
}
type NomadRunnerManager struct {
- apiClient nomad.ExecutorAPI
- jobs NomadJobStorage
- usedRunners Storage
+ apiClient nomad.ExecutorAPI
+ environments NomadEnvironmentStorage
+ usedRunners Storage
}
// NewNomadRunnerManager creates a new runner manager that keeps track of all runners.
@@ -66,35 +70,43 @@ func NewNomadRunnerManager(apiClient nomad.ExecutorAPI, ctx context.Context) *No
return m
}
- type NomadJob struct {
+ type NomadEnvironment struct {
environmentID EnvironmentID
- jobID NomadJobID
idleRunners Storage
desiredIdleRunnersCount uint
+ templateJob *nomadApi.Job
}
- func (j *NomadJob) ID() EnvironmentID {
+ func (j *NomadEnvironment) ID() EnvironmentID {
return j.environmentID
}
- func (m *NomadRunnerManager) RegisterEnvironment(environmentID EnvironmentID, nomadJobID NomadJobID,
- desiredIdleRunnersCount uint) {
- m.jobs.Add(&NomadJob{
+ func (m *NomadRunnerManager) RegisterEnvironment(environmentID EnvironmentID, desiredIdleRunnersCount uint) error {
+ templateJob, err := m.apiClient.LoadTemplateJob(environmentID.toString())
+ if err != nil {
+ return fmt.Errorf("couldn't register environment: %w", err)
+ }
+ m.environments.Add(&NomadEnvironment{
environmentID,
- nomadJobID,
NewLocalRunnerStorage(),
desiredIdleRunnersCount,
+ templateJob,
})
- go m.refreshEnvironment(environmentID)
+ err = m.scaleEnvironment(environmentID)
+ if err != nil {
+ return fmt.Errorf("couldn't upscale environment %w", err)
+ }
+ return nil
}
func (m *NomadRunnerManager) EnvironmentExists(id EnvironmentID) (ok bool) {
- _, ok = m.jobs.Get(id)
+ _, ok = m.environments.Get(id)
return
}
func (m *NomadRunnerManager) Claim(environmentID EnvironmentID) (Runner, error) {
- job, ok := m.jobs.Get(environmentID)
+ job, ok := m.environments.Get(environmentID)
if !ok {
return nil, ErrUnknownExecutionEnvironment
}
@@ -103,6 +115,10 @@ func (m *NomadRunnerManager) Claim(environmentID EnvironmentID) (Runner, error)
return nil, ErrNoRunnersAvailable
}
m.usedRunners.Add(runner)
+ err := m.scaleEnvironment(environmentID)
+ if err != nil {
+ return nil, fmt.Errorf("can not scale up: %w", err)
+ }
return runner, nil
}
@@ -141,7 +157,7 @@ func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation) {
return
}
- job, ok := m.jobs.Get(EnvironmentID(intJobID))
+ job, ok := m.environments.Get(EnvironmentID(intJobID))
if ok {
job.idleRunners.Add(NewNomadAllocation(alloc.ID, m.apiClient))
}
@@ -156,58 +172,55 @@ func (m *NomadRunnerManager) onAllocationStopped(alloc *nomadApi.Allocation) {
}
m.usedRunners.Delete(alloc.ID)
- job, ok := m.jobs.Get(EnvironmentID(intJobID))
+ job, ok := m.environments.Get(EnvironmentID(intJobID))
if ok {
job.idleRunners.Delete(alloc.ID)
}
}
- // Refresh Big ToDo: Improve this function!! State out that it also rescales the job; Provide context to be terminable...
- func (m *NomadRunnerManager) refreshEnvironment(id EnvironmentID) {
- job, ok := m.jobs.Get(id)
+ // scaleEnvironment makes sure that the amount of idle runners is at least the desiredIdleRunnersCount.
+ func (m *NomadRunnerManager) scaleEnvironment(id EnvironmentID) error {
+ environment, ok := m.environments.Get(id)
if !ok {
// this environment does not exist
- return
+ return ErrUnknownExecutionEnvironment
}
- var lastJobScaling = 0
- for {
- runners, err := m.apiClient.LoadRunners(string(job.jobID))
- if err != nil {
- log.WithError(err).Printf("Failed fetching runners")
- break
- }
- for _, r := range m.unusedRunners(id, runners) {
- // ToDo: Listen on Nomad event stream
- log.Printf("Adding allocation %+v", r)
- job.idleRunners.Add(r)
- }
- jobScale, err := m.apiClient.JobScale(string(job.jobID))
+ required := int(environment.desiredIdleRunnersCount) - environment.idleRunners.Length()
+ for i := 0; i < required; i++ {
+ err := m.createRunner(environment)
if err != nil {
- log.WithError(err).WithField("job", string(job.jobID)).Printf("Failed get allocation count")
- break
- }
- additionallyNeededRunners := int(job.desiredIdleRunnersCount) - job.idleRunners.Length()
- requiredRunnerCount := int(jobScale)
- if additionallyNeededRunners > 0 {
- requiredRunnerCount += additionallyNeededRunners
- }
- time.Sleep(50 * time.Millisecond)
- if requiredRunnerCount != lastJobScaling {
- log.Printf("Set job scaling %d", requiredRunnerCount)
- err = m.apiClient.SetJobScale(string(job.jobID), uint(requiredRunnerCount), "Runner Requested")
- if err != nil {
- log.WithError(err).Printf("Failed set allocation scaling")
- continue
- }
- lastJobScaling = requiredRunnerCount
+ return fmt.Errorf("couldn't create new runner: %w", err)
}
}
+ return nil
}
- func (m *NomadRunnerManager) unusedRunners(environmentId EnvironmentID, fetchedRunnerIds []string) (newRunners []Runner) {
+ func (m *NomadRunnerManager) createRunner(environment *NomadEnvironment) error {
+ newUUID, err := uuid.NewUUID()
+ if err != nil {
+ return fmt.Errorf("failed generating runner id")
+ }
+ newRunnerID := fmt.Sprintf(runnerNameFormat, environment.ID().toString(), newUUID.String())
+ template := *environment.templateJob
+ template.ID = &newRunnerID
+ template.Name = &newRunnerID
+ evalID, err := m.apiClient.RegisterNomadJob(&template)
+ if err != nil {
+ return fmt.Errorf("couldn't register Nomad job: %w", err)
+ }
+ err = m.apiClient.MonitorEvaluation(evalID, context.Background())
+ if err != nil {
+ return fmt.Errorf("couldn't monitor evaluation: %w", err)
+ }
+ environment.idleRunners.Add(NewNomadJob(newRunnerID, m.apiClient))
+ return nil
+ }
+ func (m *NomadRunnerManager) unusedRunners(environmentID EnvironmentID, fetchedRunnerIds []string) (newRunners []Runner) {
newRunners = make([]Runner, 0)
- job, ok := m.jobs.Get(environmentId)
+ job, ok := m.environments.Get(environmentID)
if !ok {
// the environment does not exist, so it won't have any unused runners
return
@@ -217,7 +230,7 @@ func (m *NomadRunnerManager) unusedRunners(environmentId EnvironmentID, fetchedR
if !ok {
_, ok = job.idleRunners.Get(runnerID)
if !ok {
- newRunners = append(newRunners, NewNomadAllocation(runnerID, m.apiClient))
+ newRunners = append(newRunners, NewNomadJob(runnerID, m.apiClient))
}
}
}