Add independent environment reload

in case the prewarming pool is depleted below the configured threshold (see PrewarmingPoolThreshold) and is still depleted after a timeout (PrewarmingPoolReloadTimeout).
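
For illustration, the new settings introduced in the example configuration below (nested under the server section) would make Poseidon reload an environment whose pool stays below 50% idle runners for 300 seconds:

  alert:
    prewarmingpoolthreshold: 0.5
    prewarmingpoolreloadtimeout: 300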
Maximilian Paß
2023-11-03 15:31:26 +01:00
committed by Sebastian Serth
parent c46a09eeae
commit 543939e5cb
10 changed files with 295 additions and 80 deletions

View File

@@ -19,10 +19,16 @@ server:
   interactivestderr: true
   # If set, the file at the given path overwrites the default Nomad job file in internal/environment/template-environment-job.hcl
   # templatejobfile: ./poseidon.hcl
-  # The prewarming pool alert threshold [0, 1) defines which part of the prewarming pool should always be filled.
-  # Setting it to 0 will disable the alert.
-  # If the prewarming pool is filled for less than, i.e., 50%, the health route of Poseidon will return a warning.
-  prewarmingpoolalertthreshold: 0.5
+  # alert defines how poseidon should handle specific risks.
+  alert:
+    # The prewarming pool threshold [0, 1) defines which part of the prewarming pool should always be filled.
+    # Setting it to 0 will disable the alert.
+    # If the prewarming pool is filled for less than, i.e., 50%, the health route of Poseidon will return a warning.
+    prewarmingpoolthreshold: 0.5
+    # The prewarming pool reload timeout (in seconds) defines for how long the low prewarming pool warning (above)
+    # should be active before Poseidon automatically reloads the environment.
+    # Setting it to 0 will disable the automatic reload.
+    prewarmingpoolreloadtimeout: 300

 # Configuration of the used Nomad cluster
 nomad:

View File

@@ -3,6 +3,8 @@ package api
 import (
     "github.com/gorilla/mux"
     "github.com/openHPI/poseidon/internal/config"
+    "github.com/openHPI/poseidon/internal/environment"
+    "github.com/openHPI/poseidon/pkg/dto"
     "github.com/openHPI/poseidon/tests"
     "github.com/stretchr/testify/suite"
     "net/http"
@@ -25,7 +27,9 @@ func TestMainTestSuite(t *testing.T) {
 func (s *MainTestSuite) TestNewRouterV1WithAuthenticationDisabled() {
     config.Config.Server.Token = ""
     router := mux.NewRouter()
-    configureV1Router(router, nil, nil)
+    m := &environment.ManagerHandlerMock{}
+    m.On("Statistics").Return(make(map[dto.EnvironmentID]*dto.StatisticalExecutionEnvironmentData))
+    configureV1Router(router, nil, m)

     s.Run("health route is accessible", func() {
         request, err := http.NewRequest(http.MethodGet, "/api/v1/health", http.NoBody)
@@ -52,7 +56,9 @@ func (s *MainTestSuite) TestNewRouterV1WithAuthenticationDisabled() {
 func (s *MainTestSuite) TestNewRouterV1WithAuthenticationEnabled() {
     config.Config.Server.Token = "TestToken"
     router := mux.NewRouter()
-    configureV1Router(router, nil, nil)
+    m := &environment.ManagerHandlerMock{}
+    m.On("Statistics").Return(make(map[dto.EnvironmentID]*dto.StatisticalExecutionEnvironmentData))
+    configureV1Router(router, nil, m)

     s.Run("health route is accessible", func() {
         request, err := http.NewRequest(http.MethodGet, "/api/v1/health", http.NoBody)

View File

@@ -30,7 +30,7 @@ func Health(manager environment.Manager) http.HandlerFunc {
 func checkPrewarmingPool(manager environment.Manager) error {
     var depletingEnvironments []int
     for _, data := range manager.Statistics() {
-        if float64(data.IdleRunners)/float64(data.PrewarmingPoolSize) < config.Config.Server.PrewarmingPoolAlertThreshold {
+        if float64(data.IdleRunners)/float64(data.PrewarmingPoolSize) < config.Config.Server.Alert.PrewarmingPoolThreshold {
             depletingEnvironments = append(depletingEnvironments, data.ID)
         }
     }

View File

@@ -39,7 +39,7 @@ func (s *MainTestSuite) TestHealth() {
             IdleRunners: 1,
         },
     })
-    config.Config.Server.PrewarmingPoolAlertThreshold = 0.5
+    config.Config.Server.Alert.PrewarmingPoolThreshold = 0.5

     Health(manager).ServeHTTP(recorder, request)
     s.Equal(http.StatusServiceUnavailable, recorder.Code)

View File

@@ -29,9 +29,12 @@ var (
             CertFile: "",
             KeyFile:  "",
         },
         InteractiveStderr: true,
         TemplateJobFile:   "",
-        PrewarmingPoolAlertThreshold: 0,
+        Alert: alert{
+            PrewarmingPoolThreshold:     0,
+            PrewarmingPoolReloadTimeout: 0,
+        },
     },
     Nomad: Nomad{
         Enabled: true,
@@ -80,15 +83,20 @@
     ErrConfigInitialized = errors.New("configuration is already initialized")
 )

+type alert struct {
+    PrewarmingPoolThreshold     float64
+    PrewarmingPoolReloadTimeout uint
+}
+
 // server configures the Poseidon webserver.
 type server struct {
     Address           string
     Port              int
     Token             string
     TLS               TLS
     InteractiveStderr bool
     TemplateJobFile   string
-    PrewarmingPoolAlertThreshold float64
+    Alert             alert
 }

 // URL returns the URL of the Poseidon webserver.

View File

@@ -30,11 +30,13 @@ const (
     ConfigMetaPoolSizeKey = "prewarmingPoolSize"
     TemplateJobNameParts  = 2
     RegisterTimeout       = 10 * time.Second
+    RunnerTimeoutFallback = 60 * time.Second
 )

 var (
     ErrorInvalidJobID = errors.New("invalid job id")
-    TaskArgs          = []string{"infinity"}
+    ErrorMissingTaskGroup = errors.New("couldn't find config task group in job")
+    TaskArgs              = []string{"infinity"}
 )

 func (a *APIClient) RegisterRunnerJob(template *nomadApi.Job) error {

View File

@@ -17,7 +17,7 @@ func NewAWSRunnerManager(ctx context.Context) *AWSRunnerManager {
 }

 func (a AWSRunnerManager) Claim(id dto.EnvironmentID, duration int) (Runner, error) {
-    environment, ok := a.environments.Get(id.ToString())
+    environment, ok := a.GetEnvironment(id)
     if !ok {
         r, err := a.NextHandler().Claim(id, duration)
         if err != nil {

View File

@@ -112,5 +112,7 @@ func createBasicEnvironmentMock(id dto.EnvironmentID) *ExecutionEnvironmentMock
     environment.On("NetworkAccess").Return(false, nil)
     environment.On("DeleteRunner", mock.AnythingOfType("string")).Return(false)
     environment.On("ApplyPrewarmingPoolSize").Return(nil)
+    environment.On("IdleRunnerCount").Return(uint(1)).Maybe()
+    environment.On("PrewarmingPoolSize").Return(uint(1)).Maybe()
     return environment
 }

View File

@@ -6,14 +6,15 @@
     "fmt"
     nomadApi "github.com/hashicorp/nomad/api"
     influxdb2 "github.com/influxdata/influxdb-client-go/v2"
+    "github.com/openHPI/poseidon/internal/config"
     "github.com/openHPI/poseidon/internal/nomad"
     "github.com/openHPI/poseidon/pkg/dto"
     "github.com/openHPI/poseidon/pkg/logging"
     "github.com/openHPI/poseidon/pkg/monitoring"
     "github.com/openHPI/poseidon/pkg/storage"
     "github.com/openHPI/poseidon/pkg/util"
-    "github.com/sirupsen/logrus"
     "strconv"
+    "sync"
     "time"
 )
@@ -26,21 +27,23 @@
 type NomadRunnerManager struct {
     *AbstractManager
     apiClient nomad.ExecutorAPI
+    reloadingEnvironment map[dto.EnvironmentID]*sync.Mutex
 }

 // NewNomadRunnerManager creates a new runner manager that keeps track of all runners.
 // KeepRunnersSynced has to be started separately.
 func NewNomadRunnerManager(apiClient nomad.ExecutorAPI, ctx context.Context) *NomadRunnerManager {
-    return &NomadRunnerManager{NewAbstractManager(ctx), apiClient}
+    return &NomadRunnerManager{NewAbstractManager(ctx), apiClient, make(map[dto.EnvironmentID]*sync.Mutex)}
 }

 func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int) (Runner, error) {
-    environment, ok := m.environments.Get(environmentID.ToString())
+    environment, ok := m.GetEnvironment(environmentID)
     if !ok {
         return nil, ErrUnknownExecutionEnvironment
     }

     runner, ok := environment.Sample()
+    go m.checkPrewarmingPoolAlert(environment)
     if !ok {
         return nil, ErrNoRunnersAvailable
     }
@@ -52,6 +55,44 @@ func (m *NomadRunnerManager) Claim(environmentID dto.EnvironmentID, duration int
     return runner, nil
 }

+// checkPrewarmingPoolAlert checks if the prewarming pool contains enough idle runners as specified by the PrewarmingPoolThreshold
+// if not it starts an environment reload mechanism according to the PrewarmingPoolReloadTimeout.
+func (m *NomadRunnerManager) checkPrewarmingPoolAlert(environment ExecutionEnvironment) {
+    mutex := m.reloadingEnvironment[environment.ID()]
+    if !mutex.TryLock() {
+        // The environment is already about to be reloaded
+        return
+    }
+    defer mutex.Unlock()
+
+    prewarmingPoolThreshold := config.Config.Server.Alert.PrewarmingPoolThreshold
+    reloadTimeout := config.Config.Server.Alert.PrewarmingPoolReloadTimeout
+
+    if reloadTimeout == 0 || float64(environment.IdleRunnerCount())/float64(environment.PrewarmingPoolSize()) >= prewarmingPoolThreshold {
+        return
+    }
+
+    log.WithField(dto.KeyEnvironmentID, environment.ID()).Info("Prewarming Pool Alert. Checking again..")
+    <-time.After(time.Duration(reloadTimeout) * time.Second)
+    if float64(environment.IdleRunnerCount())/float64(environment.PrewarmingPoolSize()) >= prewarmingPoolThreshold {
+        return
+    }
+
+    log.WithField(dto.KeyEnvironmentID, environment.ID()).Info("Prewarming Pool Alert. Reloading environment")
+    err := util.RetryExponential(func() error {
+        usedRunners, err := m.loadEnvironment(environment)
+        if err != nil {
+            return err
+        }
+        m.updateUsedRunners(usedRunners, false)
+        return nil
+    })
+    if err != nil {
+        log.WithField(dto.KeyEnvironmentID, environment.ID()).Error("Failed to reload environment")
+    }
+}
+
 func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int) {
     err := util.RetryExponential(func() (err error) {
         if err = m.apiClient.MarkRunnerAsUsed(runner.ID(), timeoutDuration); err != nil {
@@ -80,9 +121,7 @@ func (m *NomadRunnerManager) Return(r Runner) error {
 // SynchronizeRunners loads all runners and keeps them synchronized (without a retry mechanism).
 func (m *NomadRunnerManager) SynchronizeRunners(ctx context.Context) error {
     log.Info("Loading runners")
-    if err := m.load(); err != nil {
-        return fmt.Errorf("failed loading runners: %w", err)
-    }
+    m.load()

     // Watch for changes regarding the existing or new runners.
     log.Info("Watching Event Stream")
@@ -95,69 +134,101 @@
     return err
 }

-// Load recovers all runners for all existing environments.
-func (m *NomadRunnerManager) load() error {
-    newUsedRunners := storage.NewLocalStorage[Runner]()
-    for _, environment := range m.environments.List() {
-        environmentLogger := log.WithField(dto.KeyEnvironmentID, environment.ID().ToString())
-        runnerJobs, err := m.apiClient.LoadRunnerJobs(environment.ID())
-        if err != nil {
-            return fmt.Errorf("failed fetching the runner jobs: %w", err)
-        }
-        for _, job := range runnerJobs {
-            m.loadSingleJob(job, environmentLogger, environment, newUsedRunners)
-        }
-        err = environment.ApplyPrewarmingPoolSize()
-        if err != nil {
-            return fmt.Errorf("couldn't scale environment: %w", err)
-        }
-    }
-    m.updateUsedRunners(newUsedRunners)
-    return nil
+func (m *NomadRunnerManager) StoreEnvironment(environment ExecutionEnvironment) {
+    m.AbstractManager.StoreEnvironment(environment)
+    m.reloadingEnvironment[environment.ID()] = &sync.Mutex{}
 }

-func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger *logrus.Entry,
-    environment ExecutionEnvironment, newUsedRunners storage.Storage[Runner]) {
+func (m *NomadRunnerManager) DeleteEnvironment(id dto.EnvironmentID) {
+    m.AbstractManager.DeleteEnvironment(id)
+    delete(m.reloadingEnvironment, id)
+}
+
+// Load recovers all runners for all existing environments.
+func (m *NomadRunnerManager) load() {
+    newUsedRunners := storage.NewLocalStorage[Runner]()
+    for _, environment := range m.ListEnvironments() {
+        usedRunners, err := m.loadEnvironment(environment)
+        if err != nil {
+            log.WithError(err).WithField(dto.KeyEnvironmentID, environment.ID().ToString()).
+                Warn("Failed loading environment. Skipping ...")
+            continue
+        }
+        for _, r := range usedRunners.List() {
+            newUsedRunners.Add(r.ID(), r)
+        }
+    }
+
+    m.updateUsedRunners(newUsedRunners, true)
+}
+
+func (m *NomadRunnerManager) loadEnvironment(environment ExecutionEnvironment) (used storage.Storage[Runner], err error) {
+    used = storage.NewLocalStorage[Runner]()
+
+    runnerJobs, err := m.apiClient.LoadRunnerJobs(environment.ID())
+    if err != nil {
+        return nil, fmt.Errorf("failed fetching the runner jobs: %w", err)
+    }
+    for _, job := range runnerJobs {
+        r, isUsed, err := m.loadSingleJob(job, environment)
+        if err != nil {
+            log.WithError(err).WithField(dto.KeyEnvironmentID, environment.ID().ToString()).
+                WithField("used", isUsed).Warn("Failed loading job. Skipping ...")
+            continue
+        } else if isUsed {
+            used.Add(r.ID(), r)
+        }
+    }
+    err = environment.ApplyPrewarmingPoolSize()
+    if err != nil {
+        return used, fmt.Errorf("couldn't scale environment: %w", err)
+    }
+    return used, nil
+}
+
+func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environment ExecutionEnvironment) (r Runner, isUsed bool, err error) {
     configTaskGroup := nomad.FindTaskGroup(job, nomad.ConfigTaskGroupName)
     if configTaskGroup == nil {
-        environmentLogger.Warnf("Couldn't find config task group in job %s, skipping ...", *job.ID)
-        return
+        return nil, false, fmt.Errorf("%w, %s", nomad.ErrorMissingTaskGroup, *job.ID)
     }
-    isUsed := configTaskGroup.Meta[nomad.ConfigMetaUsedKey] == nomad.ConfigMetaUsedValue
+    isUsed = configTaskGroup.Meta[nomad.ConfigMetaUsedKey] == nomad.ConfigMetaUsedValue
     portMappings, err := m.apiClient.LoadRunnerPortMappings(*job.ID)
     if err != nil {
-        environmentLogger.WithError(err).Warn("Error loading runner portMappings, skipping ...")
-        return
+        return nil, false, fmt.Errorf("error loading runner portMappings: %w", err)
     }
     newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.onRunnerDestroyed)
     log.WithField("isUsed", isUsed).WithField(dto.KeyRunnerID, newJob.ID()).Debug("Recovered Runner")
     if isUsed {
-        newUsedRunners.Add(newJob.ID(), newJob)
         timeout, err := strconv.Atoi(configTaskGroup.Meta[nomad.ConfigMetaTimeoutKey])
         if err != nil {
-            environmentLogger.WithError(err).Warn("Error loading timeout from meta values")
-        } else {
-            newJob.SetupTimeout(time.Duration(timeout) * time.Second)
+            log.WithField(dto.KeyRunnerID, newJob.ID()).WithError(err).Warn("failed loading timeout from meta values")
+            timeout = int(nomad.RunnerTimeoutFallback.Seconds())
+            go m.markRunnerAsUsed(newJob, timeout)
         }
+        newJob.SetupTimeout(time.Duration(timeout) * time.Second)
     } else {
         environment.AddRunner(newJob)
     }
+    return newJob, isUsed, nil
 }

-func (m *NomadRunnerManager) updateUsedRunners(newUsedRunners storage.Storage[Runner]) {
+// updateUsedRunners handles the cleanup process of updating the used runner storage.
+// This includes the clean deletion of the local references to the (replaced/deleted) runners.
+// Only if removeDeleted is set, the runners that are only in newUsedRunners (and not in the main m.usedRunners) will be removed.
+func (m *NomadRunnerManager) updateUsedRunners(newUsedRunners storage.Storage[Runner], removeDeleted bool) {
     for _, r := range m.usedRunners.List() {
         var reason DestroyReason
         if _, ok := newUsedRunners.Get(r.ID()); ok {
             reason = ErrDestroyedAndReplaced
-        } else {
+        } else if removeDeleted {
             reason = ErrLocalDestruction
             log.WithError(reason).WithField(dto.KeyRunnerID, r.ID()).Warn("Local runner cannot be recovered")
         }
-        m.usedRunners.Delete(r.ID())
-        if err := r.Destroy(reason); err != nil {
-            log.WithError(err).WithField(dto.KeyRunnerID, r.ID()).Warn("failed to destroy runner locally")
+        if reason != nil {
+            m.usedRunners.Delete(r.ID())
+            if err := r.Destroy(reason); err != nil {
+                log.WithError(err).WithField(dto.KeyRunnerID, r.ID()).Warn("failed to destroy runner locally")
+            }
         }
     }
@@ -186,7 +257,7 @@ func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, start
         return
     }

-    environment, ok := m.environments.Get(environmentID.ToString())
+    environment, ok := m.GetEnvironment(environmentID)
     if ok {
         var mappedPorts []nomadApi.PortMapping
         if alloc.AllocatedResources != nil {
@@ -227,7 +298,7 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error)
         }
     }

-    environment, ok := m.environments.Get(environmentID.ToString())
+    environment, ok := m.GetEnvironment(environmentID)
     if ok {
         stillActive = stillActive || environment.DeleteRunner(runnerID)
     }
@@ -240,7 +311,7 @@ func (m *NomadRunnerManager) onAllocationStopped(runnerID string, reason error)
 func (m *NomadRunnerManager) onRunnerDestroyed(r Runner) error {
     m.usedRunners.Delete(r.ID())

-    environment, ok := m.environments.Get(r.Environment().ToString())
+    environment, ok := m.GetEnvironment(r.Environment())
     if ok {
         environment.DeleteRunner(r.ID())
     }

View File

@@ -3,6 +3,7 @@ package runner
 import (
     "context"
     nomadApi "github.com/hashicorp/nomad/api"
+    "github.com/openHPI/poseidon/internal/config"
     "github.com/openHPI/poseidon/internal/nomad"
     "github.com/openHPI/poseidon/pkg/dto"
     "github.com/openHPI/poseidon/pkg/storage"
@@ -533,13 +534,12 @@ func (s *MainTestSuite) TestNomadRunnerManager_Load() {
         s.ExpectedGoroutingIncrease++ // We dont care about destroying the created runner.
         call.Return([]*nomadApi.Job{job}, nil)

-        err := runnerManager.load()
-        s.NoError(err)
+        runnerManager.load()
         environmentMock.AssertExpectations(s.T())
     })

     s.Run("Stores used runner", func() {
+        apiMock.On("MarkRunnerAsUsed", mock.AnythingOfType("string"), mock.AnythingOfType("int")).Return(nil)
         _, job := helpers.CreateTemplateJob()
         jobID := tests.DefaultRunnerID
         job.ID = &jobID
@@ -547,14 +547,11 @@
         configTaskGroup := nomad.FindTaskGroup(job, nomad.ConfigTaskGroupName)
         s.Require().NotNil(configTaskGroup)
         configTaskGroup.Meta[nomad.ConfigMetaUsedKey] = nomad.ConfigMetaUsedValue
-        s.ExpectedGoroutingIncrease++ // We dont care about destroying the created runner.
+        s.ExpectedGoroutingIncrease++ // We don't care about destroying the created runner.
         call.Return([]*nomadApi.Job{job}, nil)

         s.Require().Zero(runnerManager.usedRunners.Length())
-
-        err := runnerManager.load()
-        s.NoError(err)
+        runnerManager.load()
         _, ok := runnerManager.usedRunners.Get(tests.DefaultRunnerID)
         s.True(ok)
     })
@@ -576,10 +573,7 @@
         call.Return([]*nomadApi.Job{job}, nil)

         s.Require().Zero(runnerManager.usedRunners.Length())
-
-        err := runnerManager.load()
-        s.NoError(err)
+        runnerManager.load()
         s.Require().NotZero(runnerManager.usedRunners.Length())

         <-time.After(time.Duration(timeout*2) * time.Second)
@@ -587,6 +581,132 @@
     })
 }

+func (s *MainTestSuite) TestNomadRunnerManager_checkPrewarmingPoolAlert() {
+    timeout := uint(1)
+    config.Config.Server.Alert.PrewarmingPoolReloadTimeout = timeout
+    config.Config.Server.Alert.PrewarmingPoolThreshold = 0.5
+    environment := &ExecutionEnvironmentMock{}
+    environment.On("ID").Return(dto.EnvironmentID(tests.DefaultEnvironmentIDAsInteger))
+    environment.On("Image").Return("")
+    environment.On("CPULimit").Return(uint(0))
+    environment.On("MemoryLimit").Return(uint(0))
+    environment.On("NetworkAccess").Return(false, nil)
+    apiMock := &nomad.ExecutorAPIMock{}
+    m := NewNomadRunnerManager(apiMock, s.TestCtx)
+    m.StoreEnvironment(environment)
+
+    s.Run("does not allow concurrent calls", func() {
+        environment.On("PrewarmingPoolSize").Return(uint(1)).Once()
+        secondCallDone := make(chan struct{})
+        environment.On("IdleRunnerCount").Run(func(_ mock.Arguments) {
+            <-secondCallDone
+        }).Return(uint(1)).Once()
+        go m.checkPrewarmingPoolAlert(environment)
+        <-time.After(tests.ShortTimeout)
+
+        go func() {
+            m.checkPrewarmingPoolAlert(environment)
+            close(secondCallDone)
+        }()
+        <-time.After(tests.ShortTimeout)
+        environment.AssertExpectations(s.T())
+    })
+
+    s.Run("checks the alert condition again after the reload timeout", func() {
+        environment.On("PrewarmingPoolSize").Return(uint(1)).Once()
+        environment.On("IdleRunnerCount").Return(uint(0)).Once()
+        environment.On("PrewarmingPoolSize").Return(uint(1)).Once()
+        environment.On("IdleRunnerCount").Return(uint(1)).Once()
+        checkDone := make(chan struct{})
+        go func() {
+            m.checkPrewarmingPoolAlert(environment)
+            close(checkDone)
+        }()
+
+        select {
+        case <-checkDone:
+            s.Fail("checkPrewarmingPoolAlert returned before the reload timeout")
+        case <-time.After(time.Duration(timeout) * time.Second / 2):
+        }
+
+        select {
+        case <-time.After(time.Duration(timeout) * time.Second):
+            s.Fail("checkPrewarmingPoolAlert did not return after checking the alert condition again")
+        case <-checkDone:
+        }
+        environment.AssertExpectations(s.T())
+    })
+
+    s.Run("checks the alert condition again after the reload timeout", func() {
+        environment.On("PrewarmingPoolSize").Return(uint(1)).Twice()
+        environment.On("IdleRunnerCount").Return(uint(0)).Twice()
+        apiMock.On("LoadRunnerJobs", environment.ID()).Return([]*nomadApi.Job{}, nil).Once()
+        environment.On("ApplyPrewarmingPoolSize").Return(nil).Once()
+        checkDone := make(chan struct{})
+        go func() {
+            m.checkPrewarmingPoolAlert(environment)
+            close(checkDone)
+        }()
+
+        select {
+        case <-time.After(time.Duration(timeout) * time.Second * 2):
+            s.Fail("checkPrewarmingPoolAlert did not return")
+        case <-checkDone:
+        }
+        environment.AssertExpectations(s.T())
+    })
+}
+
+func (s *MainTestSuite) TestNomadRunnerManager_checkPrewarmingPoolAlert_reloadsRunners() {
+    config.Config.Server.Alert.PrewarmingPoolReloadTimeout = uint(1)
+    config.Config.Server.Alert.PrewarmingPoolThreshold = 0.5
+    environment := &ExecutionEnvironmentMock{}
+    environment.On("ID").Return(dto.EnvironmentID(tests.DefaultEnvironmentIDAsInteger))
+    environment.On("Image").Return("")
+    environment.On("CPULimit").Return(uint(0))
+    environment.On("MemoryLimit").Return(uint(0))
+    environment.On("NetworkAccess").Return(false, nil)
+    apiMock := &nomad.ExecutorAPIMock{}
+    m := NewNomadRunnerManager(apiMock, s.TestCtx)
+    m.StoreEnvironment(environment)
+    environment.On("PrewarmingPoolSize").Return(uint(1)).Twice()
+    environment.On("IdleRunnerCount").Return(uint(0)).Twice()
+    environment.On("DeleteRunner", mock.Anything).Return(false).Once()
+
+    s.Require().Empty(m.usedRunners.Length())
+    _, usedJob := helpers.CreateTemplateJob()
+    id := tests.DefaultRunnerID
+    usedJob.ID = &id
+    configTaskGroup := nomad.FindTaskGroup(usedJob, nomad.ConfigTaskGroupName)
+    configTaskGroup.Meta[nomad.ConfigMetaUsedKey] = nomad.ConfigMetaUsedValue
+    configTaskGroup.Meta[nomad.ConfigMetaTimeoutKey] = "42"
+    _, idleJob := helpers.CreateTemplateJob()
+    idleID := tests.AnotherRunnerID
+    idleJob.ID = &idleID
+    nomad.FindTaskGroup(idleJob, nomad.ConfigTaskGroupName).Meta[nomad.ConfigMetaUsedKey] = nomad.ConfigMetaUnusedValue
+
+    apiMock.On("LoadRunnerJobs", environment.ID()).Return([]*nomadApi.Job{usedJob, idleJob}, nil).Once()
+    apiMock.On("LoadRunnerPortMappings", mock.Anything).Return(nil, nil).Twice()
+    environment.On("ApplyPrewarmingPoolSize").Return(nil).Once()
+    environment.On("AddRunner", mock.Anything).Run(func(args mock.Arguments) {
+        job, ok := args[0].(*NomadJob)
+        s.Require().True(ok)
+        err := job.Destroy(ErrLocalDestruction)
+        s.NoError(err)
+    }).Return().Once()
+
+    m.checkPrewarmingPoolAlert(environment)
+
+    r, ok := m.usedRunners.Get(tests.DefaultRunnerID)
+    s.Require().True(ok)
+    err := r.Destroy(ErrLocalDestruction)
+    s.NoError(err)
+    environment.AssertExpectations(s.T())
+}
+
 func mockWatchAllocations(ctx context.Context, apiMock *nomad.ExecutorAPIMock) {
     call := apiMock.On("WatchEventStream", mock.Anything, mock.Anything, mock.Anything)
     call.Run(func(args mock.Arguments) {