
package runner

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strconv"
	"time"

	nomadApi "github.com/hashicorp/nomad/api"
	"github.com/openHPI/poseidon/internal/nomad"
	"github.com/openHPI/poseidon/pkg/dto"
	"github.com/openHPI/poseidon/pkg/logging"
	"github.com/sirupsen/logrus"
)

var (
	log                            = logging.GetLogger("runner")
	ErrUnknownExecutionEnvironment = errors.New("execution environment not found")
	ErrNoRunnersAvailable          = errors.New("no runners available for this execution environment")
	ErrRunnerNotFound              = errors.New("no runner found with this id")
)

// ExecutionEnvironment is a group of runners that share the configuration stored in the environment.
type ExecutionEnvironment interface {
	json.Marshaler

	// ID returns the id of the environment.
	ID() dto.EnvironmentID
	SetID(id dto.EnvironmentID)
	// PrewarmingPoolSize returns the number of idle runners of this environment that should be prewarmed.
	PrewarmingPoolSize() uint
	SetPrewarmingPoolSize(count uint)
	// CPULimit returns the share of CPU that a runner should receive at minimum.
	CPULimit() uint
	SetCPULimit(limit uint)
	// MemoryLimit returns the amount of memory that should be available for each runner.
	MemoryLimit() uint
	SetMemoryLimit(limit uint)
	// Image returns the image of the runner, e.g. a Docker image.
	Image() string
	SetImage(image string)
	// NetworkAccess returns whether a runner should have network access and which ports should be mapped.
	NetworkAccess() (bool, []uint16)
	SetNetworkAccess(allow bool, ports []uint16)
	// SetConfigFrom copies all the attributes above from the passed environment to the object itself.
	SetConfigFrom(environment ExecutionEnvironment)

	// Register saves this environment at the executor.
	Register(apiClient nomad.ExecutorAPI) error
	// Delete removes this environment and all its runners from the executor and from Poseidon itself.
	Delete(apiClient nomad.ExecutorAPI) error
	// Scale ensures that the executor has enough idle runners according to the PrewarmingPoolSize.
	Scale(apiClient nomad.ExecutorAPI) error
	// UpdateRunnerSpecs updates all runners of the environment to have the same definition as the environment.
	UpdateRunnerSpecs(apiClient nomad.ExecutorAPI) error

	// Sample returns and removes an arbitrary available runner.
	// ok is true iff a runner was returned.
	Sample(apiClient nomad.ExecutorAPI) (r Runner, ok bool)
	// AddRunner adds an existing runner to the idle runners of the environment.
	AddRunner(r Runner)
	// DeleteRunner removes an idle runner from the environment.
	DeleteRunner(id string)
}
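
// configureEnvironmentExample is a minimal usage sketch, not part of the
// original file: it shows how the setters above compose when configuring an
// environment. All concrete values, including the image name, are
// illustrative assumptions.
func configureEnvironmentExample(environment ExecutionEnvironment) {
	environment.SetPrewarmingPoolSize(5)                  // keep five idle runners warm
	environment.SetCPULimit(20)                           // minimum CPU share per runner
	environment.SetMemoryLimit(256)                       // memory available to each runner
	environment.SetImage("openhpi/co_execenv_python:3.8") // hypothetical Docker image
	environment.SetNetworkAccess(false, nil)              // no network access, no mapped ports
}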

// Manager keeps track of the used and unused runners of all execution environments in order to provide unused
// runners to new clients and to ensure that no runner is used twice.
type Manager interface {
	// ListEnvironments returns all execution environments known to Poseidon.
	ListEnvironments() []ExecutionEnvironment

	// GetEnvironment returns the details of the requested environment.
	// Iff the requested environment is not stored, it returns false.
	GetEnvironment(id dto.EnvironmentID) (ExecutionEnvironment, bool)

	// SetEnvironment stores the environment in Poseidon's memory.
	// It returns true iff a new environment was stored and false iff an existing environment was updated.
	SetEnvironment(environment ExecutionEnvironment) bool

	// DeleteEnvironment removes the specified execution environment from Poseidon's memory.
	// It does nothing if the specified environment cannot be found.
	DeleteEnvironment(id dto.EnvironmentID)

	// Claim returns a new runner. The runner is deleted after duration seconds if duration is not 0.
	// It makes sure that the runner is not in use yet and returns an error if no runner could be provided.
	Claim(id dto.EnvironmentID, duration int) (Runner, error)

	// Get returns the used runner with the given runnerID.
	// If no runner with the given runnerID is currently used, it returns an error.
	Get(runnerID string) (Runner, error)

	// Return signals that the runner is no longer used by the caller and can be claimed by someone else.
	// The runner is deleted or cleaned up for reuse, depending on the executor in use.
	Return(r Runner) error

	// Load fetches all previously created runners from the executor and registers them.
	// It should be called during the startup process (e.g. on creation of the Manager).
	Load()
}
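
// claimAndReturnExample is a minimal sketch, not part of the original file,
// showing the intended runner lifecycle against the Manager interface;
// error handling is abbreviated.
func claimAndReturnExample(m Manager, environmentID dto.EnvironmentID) error {
	// Claim an unused runner; it is deleted automatically after 60 seconds.
	runner, err := m.Claim(environmentID, 60)
	if err != nil {
		return fmt.Errorf("claiming runner: %w", err)
	}
	// While the runner is in use, other call sites can retrieve it via
	// m.Get(runner.ID()). Hand it back once the client is done with it.
	return m.Return(runner)
}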

type NomadRunnerManager struct {
	apiClient    nomad.ExecutorAPI
	environments EnvironmentStorage
	usedRunners  Storage
}

// NewNomadRunnerManager creates a new runner manager that keeps track of all runners.
// It uses the apiClient for all requests and runs a background task to keep the runners in sync with Nomad.
// If you cancel the context, the background synchronization is stopped.
func NewNomadRunnerManager(apiClient nomad.ExecutorAPI, ctx context.Context) *NomadRunnerManager {
	m := &NomadRunnerManager{
		apiClient,
		NewLocalEnvironmentStorage(),
		NewLocalRunnerStorage(),
	}
	go m.keepRunnersSynced(ctx)
	return m
}
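
// newManagerExample is a minimal construction sketch, not part of the original
// file: cancelling the returned CancelFunc stops the background synchronization.
func newManagerExample(apiClient nomad.ExecutorAPI) (*NomadRunnerManager, context.CancelFunc) {
	ctx, cancel := context.WithCancel(context.Background())
	manager := NewNomadRunnerManager(apiClient, ctx)
	manager.Load() // register runners that already exist at the executor
	return manager, cancel
}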

func (m *NomadRunnerManager) ListEnvironments() []ExecutionEnvironment {
	return m.environments.List()
}

func (m *NomadRunnerManager) GetEnvironment(id dto.EnvironmentID) (ExecutionEnvironment, bool) {
	return m.environments.Get(id)
}

func (m *NomadRunnerManager) SetEnvironment(environment ExecutionEnvironment) bool {
	_, ok := m.environments.Get(environment.ID())
	m.environments.Add(environment)
	return !ok
}

func (m *NomadRunnerManager) DeleteEnvironment(id dto.EnvironmentID) {
	m.environments.Delete(id)
}

func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int) (Runner, error) {
	environment, ok := m.environments.Get(environmentID)
	if !ok {
		return nil, ErrUnknownExecutionEnvironment
	}
	runner, ok := environment.Sample(m.apiClient)
	if !ok {
		return nil, ErrNoRunnersAvailable
	}
	m.usedRunners.Add(runner)
	err := m.apiClient.MarkRunnerAsUsed(runner.ID(), duration)
	if err != nil {
		return nil, fmt.Errorf("can't mark runner as used: %w", err)
	}

	runner.SetupTimeout(time.Duration(duration) * time.Second)
	return runner, nil
}

func (m *NomadRunnerManager) Get(runnerID string) (Runner, error) {
	runner, ok := m.usedRunners.Get(runnerID)
	if !ok {
		return nil, ErrRunnerNotFound
	}
	return runner, nil
}

func (m *NomadRunnerManager) Return(r Runner) error {
	r.StopTimeout()
	err := m.apiClient.DeleteJob(r.ID())
	if err != nil {
		return fmt.Errorf("error deleting runner in Nomad: %w", err)
	}
	m.usedRunners.Delete(r.ID())
	return nil
}

func (m *NomadRunnerManager) Load() {
	for _, environment := range m.environments.List() {
		environmentLogger := log.WithField("environmentID", environment.ID())
		runnerJobs, err := m.apiClient.LoadRunnerJobs(environment.ID().ToString())
		if err != nil {
			environmentLogger.WithError(err).Warn("Error fetching the runner jobs")
		}
		for _, job := range runnerJobs {
			m.loadSingleJob(job, environmentLogger, environment)
		}
		err = environment.Scale(m.apiClient)
		if err != nil {
			environmentLogger.WithError(err).Error("Couldn't scale environment")
		}
	}
}

func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger *logrus.Entry,
	environment ExecutionEnvironment) {
	configTaskGroup := nomad.FindTaskGroup(job, nomad.ConfigTaskGroupName)
	if configTaskGroup == nil {
		environmentLogger.Infof("Couldn't find config task group in job %s, skipping ...", *job.ID)
		return
	}
	isUsed := configTaskGroup.Meta[nomad.ConfigMetaUsedKey] == nomad.ConfigMetaUsedValue
	portMappings, err := m.apiClient.LoadRunnerPortMappings(*job.ID)
	if err != nil {
		environmentLogger.WithError(err).Warn("Error loading runner portMappings")
		return
	}
	newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m)
	if isUsed {
		// A runner that was in use keeps its state: restore its timeout from the stored meta values.
		m.usedRunners.Add(newJob)
		timeout, err := strconv.Atoi(configTaskGroup.Meta[nomad.ConfigMetaTimeoutKey])
		if err != nil {
			environmentLogger.WithError(err).Warn("Error loading timeout from meta values")
		} else {
			newJob.SetupTimeout(time.Duration(timeout) * time.Second)
		}
	} else {
		// An unused runner goes back into the environment's idle pool.
		environment.AddRunner(newJob)
	}
}

func (m *NomadRunnerManager) keepRunnersSynced(ctx context.Context) {
	retries := 0
	for ctx.Err() == nil {
		err := m.apiClient.WatchAllocations(ctx, m.onAllocationAdded, m.onAllocationStopped)
		retries++
		log.WithError(err).Errorf("Stopped updating the runners! Retry %v", retries)
		<-time.After(time.Second)
	}
}

func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation) {
	log.WithField("id", alloc.JobID).Debug("Runner started")

	// Environment template jobs describe environments, not runners, and are therefore ignored.
	if nomad.IsEnvironmentTemplateID(alloc.JobID) {
		return
	}

	environmentID, err := nomad.EnvironmentIDFromRunnerID(alloc.JobID)
	if err != nil {
		log.WithError(err).Warn("Allocation could not be added")
		return
	}

	environment, ok := m.environments.Get(environmentID)
	if ok {
		var mappedPorts []nomadApi.PortMapping
		if alloc.AllocatedResources != nil {
			mappedPorts = alloc.AllocatedResources.Shared.Ports
		}
		environment.AddRunner(NewNomadJob(alloc.JobID, mappedPorts, m.apiClient, m))
	}
}

func (m *NomadRunnerManager) onAllocationStopped(alloc *nomadApi.Allocation) {
	log.WithField("id", alloc.JobID).Debug("Runner stopped")

	environmentID, err := nomad.EnvironmentIDFromRunnerID(alloc.JobID)
	if err != nil {
		log.WithError(err).Warn("Stopped allocation can not be handled")
		return
	}

	m.usedRunners.Delete(alloc.JobID)
	environment, ok := m.environments.Get(environmentID)
	if ok {
		environment.DeleteRunner(alloc.JobID)
	}
}