Monitor Nomad allocation startup duration.

This commit is contained in:
Maximilian Paß
2022-07-17 17:56:04 +02:00
parent 18daa1152c
commit c6e65c14bb
8 changed files with 102 additions and 63 deletions

View File

@@ -7,7 +7,6 @@ import (
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/storage"
"strconv"
)
var ErrNullObject = errors.New("functionality not available for the null object")
@@ -33,7 +32,7 @@ func NewAbstractManager() *AbstractManager {
// MonitorRunnersEnvironmentID passes the id of the environment e into the monitoring Point p.
func MonitorRunnersEnvironmentID(p *write.Point, e Runner, isDeletion bool) {
if !isDeletion && e != nil {
p.AddTag(monitoring.InfluxKeyEnvironmentID, strconv.Itoa(int(e.Environment())))
p.AddTag(monitoring.InfluxKeyEnvironmentID, e.Environment().ToString())
}
}

View File

@@ -5,9 +5,11 @@ import (
"errors"
"fmt"
nomadApi "github.com/hashicorp/nomad/api"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
"github.com/openHPI/poseidon/internal/nomad"
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/pkg/logging"
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/sirupsen/logrus"
"strconv"
"time"
@@ -118,15 +120,16 @@ func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger
func (m *NomadRunnerManager) keepRunnersSynced(ctx context.Context) {
retries := 0
for ctx.Err() == nil {
err := m.apiClient.WatchEventStream(ctx, m.onAllocationAdded, m.onAllocationStopped)
err := m.apiClient.WatchEventStream(ctx,
&nomad.AllocationProcessoring{OnNew: m.onAllocationAdded, OnDeleted: m.onAllocationStopped})
retries += 1
log.WithError(err).Errorf("Stopped updating the runners! Retry %v", retries)
<-time.After(time.Second)
}
}
func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation) {
log.WithField("id", alloc.JobID).Debug("Runner started")
func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, startup time.Duration) {
log.WithField("id", alloc.JobID).WithField("startupDuration", startup).Debug("Runner started")
if nomad.IsEnvironmentTemplateID(alloc.JobID) {
return
@@ -145,9 +148,18 @@ func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation) {
mappedPorts = alloc.AllocatedResources.Shared.Ports
}
environment.AddRunner(NewNomadJob(alloc.JobID, mappedPorts, m.apiClient, m.Return))
monitorAllocationStartupDuration(startup, alloc.JobID, environmentID)
}
}
func monitorAllocationStartupDuration(startup time.Duration, runnerID string, environmentID dto.EnvironmentID) {
p := influxdb2.NewPointWithMeasurement(monitoring.MeasurementIdleRunnerNomad)
p.AddField(monitoring.InfluxKeyDuration, startup)
p.AddTag(monitoring.InfluxKeyEnvironmentID, environmentID.ToString())
p.AddTag(monitoring.InfluxKeyRunnerID, runnerID)
monitoring.WriteInfluxPoint(p)
}
func (m *NomadRunnerManager) onAllocationStopped(alloc *nomadApi.Allocation) {
log.WithField("id", alloc.JobID).Debug("Runner stopped")

View File

@@ -227,9 +227,9 @@ func (s *ManagerTestSuite) TestUpdateRunnersAddsIdleRunner() {
modifyMockedCall(s.apiMock, "WatchEventStream", func(call *mock.Call) {
call.Run(func(args mock.Arguments) {
onCreate, ok := args.Get(1).(nomad.AllocationProcessor)
callbacks, ok := args.Get(1).(*nomad.AllocationProcessoring)
s.Require().True(ok)
onCreate(allocation)
callbacks.OnNew(allocation, 0)
call.ReturnArguments = mock.Arguments{nil}
})
})
@@ -255,9 +255,9 @@ func (s *ManagerTestSuite) TestUpdateRunnersRemovesIdleAndUsedRunner() {
modifyMockedCall(s.apiMock, "WatchEventStream", func(call *mock.Call) {
call.Run(func(args mock.Arguments) {
onDelete, ok := args.Get(2).(nomad.AllocationProcessor)
callbacks, ok := args.Get(1).(*nomad.AllocationProcessoring)
s.Require().True(ok)
onDelete(allocation)
callbacks.OnDeleted(allocation)
call.ReturnArguments = mock.Arguments{nil}
})
})
@@ -288,7 +288,7 @@ func (s *ManagerTestSuite) TestOnAllocationAdded() {
mockIdleRunners(environment.(*ExecutionEnvironmentMock))
alloc := &nomadApi.Allocation{JobID: nomad.TemplateJobID(tests.DefaultEnvironmentIDAsInteger)}
s.nomadRunnerManager.onAllocationAdded(alloc)
s.nomadRunnerManager.onAllocationAdded(alloc, 0)
_, ok = environment.Sample()
s.False(ok)
@@ -296,7 +296,7 @@ func (s *ManagerTestSuite) TestOnAllocationAdded() {
s.Run("does not panic when environment id cannot be parsed", func() {
alloc := &nomadApi.Allocation{JobID: ""}
s.NotPanics(func() {
s.nomadRunnerManager.onAllocationAdded(alloc)
s.nomadRunnerManager.onAllocationAdded(alloc, 0)
})
})
s.Run("does not panic when environment does not exist", func() {
@@ -306,7 +306,7 @@ func (s *ManagerTestSuite) TestOnAllocationAdded() {
alloc := &nomadApi.Allocation{JobID: nomad.RunnerJobID(nonExistentEnvironment, "1-1-1-1")}
s.NotPanics(func() {
s.nomadRunnerManager.onAllocationAdded(alloc)
s.nomadRunnerManager.onAllocationAdded(alloc, 0)
})
})
s.Run("adds correct job", func() {
@@ -319,7 +319,7 @@ func (s *ManagerTestSuite) TestOnAllocationAdded() {
JobID: tests.DefaultRunnerID,
AllocatedResources: nil,
}
s.nomadRunnerManager.onAllocationAdded(alloc)
s.nomadRunnerManager.onAllocationAdded(alloc, 0)
runner, ok := environment.Sample()
s.True(ok)
@@ -339,7 +339,7 @@ func (s *ManagerTestSuite) TestOnAllocationAdded() {
Shared: nomadApi.AllocatedSharedResources{Ports: tests.DefaultPortMappings},
},
}
s.nomadRunnerManager.onAllocationAdded(alloc)
s.nomadRunnerManager.onAllocationAdded(alloc, 0)
runner, ok := environment.Sample()
s.True(ok)

View File

@@ -7,7 +7,6 @@ import (
"github.com/openHPI/poseidon/pkg/monitoring"
"github.com/openHPI/poseidon/pkg/storage"
"io"
"strconv"
)
type ExitInfo struct {
@@ -69,7 +68,7 @@ func monitorExecutionsRunnerID(env dto.EnvironmentID, runnerID string) storage.W
return func(p *write.Point, e *dto.ExecutionRequest, isDeletion bool) {
if !isDeletion && e != nil {
p.AddTag(monitoring.InfluxKeyRunnerID, runnerID)
p.AddTag(monitoring.InfluxKeyEnvironmentID, strconv.Itoa(int(env)))
p.AddTag(monitoring.InfluxKeyEnvironmentID, env.ToString())
}
}
}