
To be able to restore runner timeouts even after a Poseidon restart, the timeout is stored in the job's Nomad metadata. After a restart the timeout starts over from zero, but at least the runner is eventually returned.
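As an illustration only (this helper is hypothetical and not part of the change; it merely combines `SetMetaConfigValue` and `ConfigMetaTimeoutKey` from the file below with the official Nomad API client), persisting a runner's timeout could look roughly like this:

// storeRunnerTimeout is a sketch: it persists a runner's timeout in the job's
// config task group metadata so that it can be read back after a Poseidon
// restart. runnerID and seconds are placeholder names.
func storeRunnerTimeout(client *nomadApi.Client, runnerID string, seconds int) error {
	job, _, err := client.Jobs().Info(runnerID, nil)
	if err != nil {
		return fmt.Errorf("couldn't fetch runner job: %w", err)
	}
	if err := SetMetaConfigValue(job, ConfigMetaTimeoutKey, strconv.Itoa(seconds)); err != nil {
		return err
	}
	// Re-register the job so that Nomad persists the updated metadata.
	if _, _, err := client.Jobs().Register(job, nil); err != nil {
		return fmt.Errorf("couldn't update runner job: %w", err)
	}
	return nil
}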
package nomad

import (
	"context"
	"errors"
	"fmt"
	"strconv"

	nomadApi "github.com/hashicorp/nomad/api"
)

const (
	TaskGroupName            = "default-group"
	TaskName                 = "default-task"
	TemplateJobPrefix        = "template"
	ConfigTaskGroupName      = "config"
	DummyTaskName            = "dummy"
	DefaultTaskDriver        = "docker"
	DefaultDummyTaskDriver   = "exec"
	DefaultDummyTaskCommand  = "true"
	ConfigMetaEnvironmentKey = "environment"
	ConfigMetaUsedKey        = "used"
	ConfigMetaUsedValue      = "true"
	ConfigMetaUnusedValue    = "false"
	ConfigMetaTimeoutKey     = "timeout"
	ConfigMetaPoolSizeKey    = "prewarmingPoolSize"
)

var (
	ErrorConfigTaskGroupNotFound = errors.New("config task group not found in job")
)

// FindConfigTaskGroup returns the config task group of a job.
// The config task group should be included in all jobs.
func FindConfigTaskGroup(job *nomadApi.Job) *nomadApi.TaskGroup {
	for _, tg := range job.TaskGroups {
		if *tg.Name == ConfigTaskGroupName {
			return tg
		}
	}
	return nil
}
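
// SetMetaConfigValue stores the given key-value pair in the meta section of the job's config task group.
// It returns ErrorConfigTaskGroupNotFound if the job does not contain a config task group.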
func SetMetaConfigValue(job *nomadApi.Job, key, value string) error {
	configTaskGroup := FindConfigTaskGroup(job)
	if configTaskGroup == nil {
		return ErrorConfigTaskGroupNotFound
	}
	// Initialise the meta map lazily: assigning to a nil map would panic.
	if configTaskGroup.Meta == nil {
		configTaskGroup.Meta = make(map[string]string)
	}
	configTaskGroup.Meta[key] = value
	return nil
}

// RegisterTemplateJob creates a Nomad job based on the default job configuration and the given parameters.
// It registers the job with Nomad and waits until the registration completes.
func (a *APIClient) RegisterTemplateJob(
	basisJob *nomadApi.Job,
	id string,
	prewarmingPoolSize, cpuLimit, memoryLimit uint,
	image string,
	networkAccess bool,
	exposedPorts []uint16) (*nomadApi.Job, error) {
	job := CreateTemplateJob(basisJob, id, prewarmingPoolSize,
		cpuLimit, memoryLimit, image, networkAccess, exposedPorts)
	evalID, err := a.apiQuerier.RegisterNomadJob(job)
	if err != nil {
		return nil, fmt.Errorf("couldn't register template job: %w", err)
	}
	return job, a.MonitorEvaluation(evalID, context.Background())
}

// CreateTemplateJob creates a Nomad job based on the default job configuration and the given parameters.
// Unlike RegisterTemplateJob, it only builds the job; it does not register it with Nomad.
func CreateTemplateJob(
	basisJob *nomadApi.Job,
	id string,
	prewarmingPoolSize, cpuLimit, memoryLimit uint,
	image string,
	networkAccess bool,
	exposedPorts []uint16) *nomadApi.Job {
	job := *basisJob
	job.ID = &id
	job.Name = &id

	var taskGroup = createTaskGroup(&job, TaskGroupName, prewarmingPoolSize)
	configureTask(taskGroup, TaskName, cpuLimit, memoryLimit, image, networkAccess, exposedPorts)
	storeTemplateConfiguration(&job, prewarmingPoolSize)

	return &job
}
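
// RegisterRunnerJob marks the given job as unused, registers it with Nomad,
// and waits until the registration completes.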
func (a *APIClient) RegisterRunnerJob(template *nomadApi.Job) error {
	storeRunnerConfiguration(template)

	evalID, err := a.apiQuerier.RegisterNomadJob(template)
	if err != nil {
		return fmt.Errorf("couldn't register runner job: %w", err)
	}
	return a.MonitorEvaluation(evalID, context.Background())
}
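
// createTaskGroup returns the default task group of the job. If the job has no task groups yet,
// a new group with the prewarming pool size as count is created; otherwise, the first existing
// group is renamed and its count is set to 1.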
func createTaskGroup(job *nomadApi.Job, name string, prewarmingPoolSize uint) *nomadApi.TaskGroup {
	var taskGroup *nomadApi.TaskGroup
	if len(job.TaskGroups) == 0 {
		taskGroup = nomadApi.NewTaskGroup(name, int(prewarmingPoolSize))
		job.TaskGroups = []*nomadApi.TaskGroup{taskGroup}
	} else {
		taskGroup = job.TaskGroups[0]
		taskGroup.Name = &name
		count := 1
		taskGroup.Count = &count
	}
	return taskGroup
}
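
// configureNetwork configures the network of the first task in the task group. If network access is
// requested, Nomad's bridge network mode is used and the given ports are exposed as dynamic ports;
// otherwise, the task is isolated with network mode "none".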
func configureNetwork(taskGroup *nomadApi.TaskGroup, networkAccess bool, exposedPorts []uint16) {
	if len(taskGroup.Tasks) == 0 {
		// This function is only used internally and must be called as the last step when configuring the task.
		// This error is not recoverable.
		log.Fatal("Can't configure network before task has been configured!")
	}
	task := taskGroup.Tasks[0]

	if task.Config == nil {
		task.Config = make(map[string]interface{})
	}

	if networkAccess {
		var networkResource *nomadApi.NetworkResource
		if len(taskGroup.Networks) == 0 {
			networkResource = &nomadApi.NetworkResource{}
			taskGroup.Networks = []*nomadApi.NetworkResource{networkResource}
		} else {
			networkResource = taskGroup.Networks[0]
		}
		// Prefer the "bridge" network over "host" to get an isolated network namespace with a bridged
		// interface instead of joining the host network namespace.
		networkResource.Mode = "bridge"
		for _, portNumber := range exposedPorts {
			port := nomadApi.Port{
				Label: strconv.FormatUint(uint64(portNumber), 10),
				To:    int(portNumber),
			}
			networkResource.DynamicPorts = append(networkResource.DynamicPorts, port)
		}

		// Explicitly reset the task-level mode to override an existing setting when updating a job
		// from no network access to network access. Don't use "bridge" here, as it collides with the
		// group-level bridge mode above: Docker would then use its own 'bridge' mode, attaching all
		// allocations to the `docker0` adapter, where they could reach other non-Nomad containers.
		// Nomad's bridge network mode avoids this.
		task.Config["network_mode"] = ""
	} else {
		// Somehow, we can't set the network mode to "none" in the NetworkResource on the task group level.
		// See https://github.com/hashicorp/nomad/issues/10540
		task.Config["network_mode"] = "none"
		// Explicitly reset Networks to signal Nomad to remove a possibly existing network resource.
		taskGroup.Networks = []*nomadApi.NetworkResource{}
	}
}
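
// configureTask sets the name, resource limits, and image of the first task in the task group
// (creating the task if necessary) and configures its network.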
func configureTask(
	taskGroup *nomadApi.TaskGroup,
	name string,
	cpuLimit, memoryLimit uint,
	image string,
	networkAccess bool,
	exposedPorts []uint16) {
	var task *nomadApi.Task
	if len(taskGroup.Tasks) == 0 {
		task = nomadApi.NewTask(name, DefaultTaskDriver)
		taskGroup.Tasks = []*nomadApi.Task{task}
	} else {
		task = taskGroup.Tasks[0]
		task.Name = name
	}
	integerCPULimit := int(cpuLimit)
	integerMemoryLimit := int(memoryLimit)
	task.Resources = &nomadApi.Resources{
		CPU:      &integerCPULimit,
		MemoryMB: &integerMemoryLimit,
	}

	if task.Config == nil {
		task.Config = make(map[string]interface{})
	}
	task.Config["image"] = image

	configureNetwork(taskGroup, networkAccess, exposedPorts)
}
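
// storeTemplateConfiguration stores the prewarming pool size in the config task group of the template job.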
func storeTemplateConfiguration(job *nomadApi.Job, prewarmingPoolSize uint) {
	taskGroup := findOrCreateConfigTaskGroup(job)

	// Initialise the meta map only if required so that previously stored
	// values (such as a runner timeout) are not overwritten.
	if taskGroup.Meta == nil {
		taskGroup.Meta = make(map[string]string)
	}
	taskGroup.Meta[ConfigMetaPoolSizeKey] = strconv.Itoa(int(prewarmingPoolSize))
}
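
// storeRunnerConfiguration marks the job as unused in the config task group metadata.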
func storeRunnerConfiguration(job *nomadApi.Job) {
	taskGroup := findOrCreateConfigTaskGroup(job)

	// Initialise the meta map only if required so that previously stored
	// values (such as the timeout) survive a re-registration of the job.
	if taskGroup.Meta == nil {
		taskGroup.Meta = make(map[string]string)
	}
	taskGroup.Meta[ConfigMetaUsedKey] = ConfigMetaUnusedValue
}
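
// findOrCreateConfigTaskGroup returns the config task group of the job, creating and attaching it
// (including the required dummy task) if it does not exist yet.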
func findOrCreateConfigTaskGroup(job *nomadApi.Job) *nomadApi.TaskGroup {
	taskGroup := FindConfigTaskGroup(job)
	if taskGroup == nil {
		taskGroup = nomadApi.NewTaskGroup(ConfigTaskGroupName, 0)
		// Attach the new task group to the job; otherwise the configuration stored in it would be lost.
		job.TaskGroups = append(job.TaskGroups, taskGroup)
	}
	createDummyTaskIfNotPresent(taskGroup)
	return taskGroup
}

// createDummyTaskIfNotPresent ensures that a dummy task is in the task group so that the group is accepted by Nomad.
func createDummyTaskIfNotPresent(taskGroup *nomadApi.TaskGroup) {
	var task *nomadApi.Task
	for _, t := range taskGroup.Tasks {
		if t.Name == DummyTaskName {
			task = t
			break
		}
	}

	if task == nil {
		task = nomadApi.NewTask(DummyTaskName, DefaultDummyTaskDriver)
		taskGroup.Tasks = append(taskGroup.Tasks, task)
	}

	if task.Config == nil {
		task.Config = make(map[string]interface{})
	}
	task.Config["command"] = DefaultDummyTaskCommand
}