Always log Runner and Environment ID.
Systematically log the runner ID and the environment ID by attaching both to the request context in the findRunnerMiddleware. The logging hook reads them back from the context, so the redundant per-call WithField entries at individual log sites are removed.
@@ -76,9 +76,7 @@ func (r *RunnerController) provide(writer http.ResponseWriter, request *http.Request) {
 	case errors.Is(err, runner.ErrUnknownExecutionEnvironment):
 		writeClientError(writer, err, http.StatusNotFound, request.Context())
 	case errors.Is(err, runner.ErrNoRunnersAvailable):
-		log.WithContext(request.Context()).
-			WithField("environment", logging.RemoveNewlineSymbol(strconv.Itoa(int(environmentID)))).
-			Warn("No runners available")
+		log.WithContext(request.Context()).Warn("No runners available")
 		writeInternalServerError(writer, err, dto.ErrorNomadOverload, request.Context())
 	default:
 		writeInternalServerError(writer, err, dto.ErrorUnknown, request.Context())
@@ -238,6 +236,8 @@ func (r *RunnerController) findRunnerMiddleware(next http.Handler) http.Handler
 			return
 		}
 		ctx := runner.NewContext(request.Context(), targetRunner)
+		ctx = context.WithValue(ctx, dto.ContextKey(dto.KeyRunnerID), targetRunner.ID())
+		ctx = context.WithValue(ctx, dto.ContextKey(dto.KeyEnvironmentID), targetRunner.Environment().ToString())
 		requestWithRunner := request.WithContext(ctx)
 		next.ServeHTTP(writer, requestWithRunner)
 	})
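This middleware hunk is the heart of the commit: both IDs are stored once in the request context, and every later log call that inherits this context can carry them without an explicit WithField. A minimal, self-contained sketch of that store-and-retrieve round trip — the key type and names mirror the diff, while the ID values are hypothetical:

package main

import (
	"context"
	"fmt"
)

// ContextKey mirrors dto.ContextKey: a distinct type prevents
// collisions with context keys set by other packages.
type ContextKey string

const (
	KeyRunnerID      = "runner_id"
	KeyEnvironmentID = "environment_id"
)

func main() {
	// What findRunnerMiddleware does: enrich the context once per request.
	ctx := context.WithValue(context.Background(), ContextKey(KeyRunnerID), "15-abc123") // hypothetical runner ID
	ctx = context.WithValue(ctx, ContextKey(KeyEnvironmentID), "15")

	// What any downstream consumer (such as a logging hook) can do:
	// read the values back without threading them through every call.
	fmt.Println(ctx.Value(ContextKey(KeyRunnerID)))      // 15-abc123
	fmt.Println(ctx.Value(ContextKey(KeyEnvironmentID))) // 15
}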
@@ -99,7 +99,7 @@ func (r *RunnerController) connectToRunner(writer http.ResponseWriter, request *http.Request) {
 	defer cancelProxy()
 	proxy := newWebSocketProxy(connection, proxyCtx)
 
-	log.WithContext(proxyCtx).WithField("runnerId", targetRunner.ID()).
+	log.WithContext(proxyCtx).
 		WithField("executionID", logging.RemoveNewlineSymbol(executionID)).
 		Info("Running execution")
 	logging.StartSpan("api.runner.connect", "Execute Interactively", request.Context(), func(ctx context.Context) {
@@ -249,11 +249,12 @@ func (n *NomadEnvironment) Sample() (runner.Runner, bool) {
 			return n.createRunner(false)
 		})
 		if err != nil {
-			log.WithError(err).WithField("environmentID", n.ID()).Error("Couldn't create new runner for claimed one")
+			log.WithError(err).WithField(dto.KeyEnvironmentID, n.ID().ToString()).
+				Error("Couldn't create new runner for claimed one")
 		}
 		}()
 	} else if ok {
-		log.WithField("environment", n.ID().ToString()).Warn("Too many idle runner")
+		log.WithField(dto.KeyEnvironmentID, n.ID().ToString()).Warn("Too many idle runner")
 	}
 	return r, ok
 }
@@ -331,7 +332,7 @@ func parseJob(jobHCL string) (*nomadApi.Job, error) {
 }
 
 func (n *NomadEnvironment) createRunners(count uint, forcePull bool) error {
-	log.WithField("runnersRequired", count).WithField("id", n.ID()).Debug("Creating new runners")
+	log.WithField("runnersRequired", count).WithField(dto.KeyEnvironmentID, n.ID()).Debug("Creating new runners")
 	for i := 0; i < int(count); i++ {
 		err := n.createRunner(forcePull)
 		if err != nil {
@@ -132,10 +132,10 @@ func (nc *nomadAPIClient) Execute(runnerID string,
 	case err == nil:
 		return exitCode, nil
 	case websocket.IsCloseError(errors.Unwrap(err), websocket.CloseNormalClosure):
-		log.WithContext(ctx).WithField("runnerID", runnerID).WithError(err).Info("The exit code could not be received.")
+		log.WithContext(ctx).WithError(err).Info("The exit code could not be received.")
 		return 0, nil
 	case errors.Is(err, context.Canceled):
-		log.WithContext(ctx).WithField("runnerID", runnerID).Debug("Execution canceled by context")
+		log.WithContext(ctx).Debug("Execution canceled by context")
 		return 0, nil
 	default:
 		return 1, fmt.Errorf("error executing command in allocation: %w", err)
@@ -596,7 +596,7 @@ func (a *APIClient) executeCommandInteractivelyWithStderr(allocationID string, c
 		exit, err := a.Execute(allocationID, ctx, prepareCommandTTYStdErr(currentNanoTime, privilegedExecution), true,
 			nullio.Reader{Ctx: readingContext}, stderr, io.Discard)
 		if err != nil {
-			log.WithContext(ctx).WithError(err).WithField("runner", allocationID).Warn("Stderr task finished with error")
+			log.WithContext(ctx).WithError(err).Warn("Stderr task finished with error")
 		}
 		stderrExitChan <- exit
 	})
@@ -2,6 +2,7 @@ package runner
 
 import (
 	"errors"
+	"github.com/openHPI/poseidon/pkg/dto"
 	"sync"
 	"time"
 )
@@ -70,9 +71,10 @@ func (t *InactivityTimerImplementation) SetupTimeout(duration time.Duration) {
 		t.mu.Unlock()
 		err := t.onDestroy(t.runner)
 		if err != nil {
-			log.WithError(err).WithField("id", t.runner.ID()).Warn("Returning runner after inactivity caused an error")
+			log.WithError(err).WithField(dto.KeyRunnerID, t.runner.ID()).
+				Warn("Returning runner after inactivity caused an error")
 		} else {
-			log.WithField("id", t.runner.ID()).Info("Returning runner due to inactivity timeout")
+			log.WithField(dto.KeyRunnerID, t.runner.ID()).Info("Returning runner due to inactivity timeout")
 		}
 	})
 }
@@ -64,7 +64,7 @@ func (m *NomadRunnerManager) markRunnerAsUsed(runner Runner, timeoutDuration int
 	if err != nil {
 		err := m.Return(runner)
 		if err != nil {
-			log.WithError(err).WithField("runnerID", runner.ID()).Error("can't mark runner as used and can't return runner")
+			log.WithError(err).WithField(dto.KeyRunnerID, runner.ID()).Error("can't mark runner as used and can't return runner")
 		}
 	}
 }
@@ -86,7 +86,7 @@ func (m *NomadRunnerManager) Return(r Runner) error {
 
 func (m *NomadRunnerManager) Load() {
 	for _, environment := range m.environments.List() {
-		environmentLogger := log.WithField("environmentID", environment.ID())
+		environmentLogger := log.WithField(dto.KeyEnvironmentID, environment.ID().ToString())
 		runnerJobs, err := m.apiClient.LoadRunnerJobs(environment.ID())
 		if err != nil {
 			environmentLogger.WithError(err).Warn("Error fetching the runner jobs")
@@ -115,7 +115,7 @@ func (m *NomadRunnerManager) loadSingleJob(job *nomadApi.Job, environmentLogger
 		return
 	}
 	newJob := NewNomadJob(*job.ID, portMappings, m.apiClient, m.Return)
-	log.WithField("isUsed", isUsed).WithField("runner_id", newJob.ID()).Debug("Recovered Runner")
+	log.WithField("isUsed", isUsed).WithField(dto.KeyRunnerID, newJob.ID()).Debug("Recovered Runner")
 	if isUsed {
 		m.usedRunners.Add(newJob.ID(), newJob)
 		timeout, err := strconv.Atoi(configTaskGroup.Meta[nomad.ConfigMetaTimeoutKey])
@@ -141,14 +141,15 @@ func (m *NomadRunnerManager) keepRunnersSynced(ctx context.Context) {
 }
 
 func (m *NomadRunnerManager) onAllocationAdded(alloc *nomadApi.Allocation, startup time.Duration) {
-	log.WithField("id", alloc.JobID).WithField("startupDuration", startup).Debug("Runner started")
+	log.WithField(dto.KeyRunnerID, alloc.JobID).WithField("startupDuration", startup).Debug("Runner started")
 
 	if nomad.IsEnvironmentTemplateID(alloc.JobID) {
 		return
 	}
 
 	if _, ok := m.usedRunners.Get(alloc.JobID); ok {
-		log.WithField("id", alloc.JobID).WithField("states", alloc.TaskStates).Error("Started Runner is already in use")
+		log.WithField(dto.KeyRunnerID, alloc.JobID).WithField("states", alloc.TaskStates).
+			Error("Started Runner is already in use")
 		return
 	}
 
@@ -178,7 +179,7 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
 }
 
 func (m *NomadRunnerManager) onAllocationStopped(runnerID string) (alreadyRemoved bool) {
-	log.WithField("id", runnerID).Debug("Runner stopped")
+	log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped")
 
 	if nomad.IsEnvironmentTemplateID(runnerID) {
 		return false
@@ -201,7 +201,7 @@ func (r *NomadJob) GetFileContent(
 	p.AddTag(monitoring.InfluxKeyRunnerID, r.ID())
 	environmentID, err := nomad.EnvironmentIDFromRunnerID(r.ID())
 	if err != nil {
-		log.WithContext(ctx).WithField("runnerID", r.ID()).WithError(err).Warn("can not parse environment id")
+		log.WithContext(ctx).WithError(err).Warn("can not parse environment id")
 	}
 	p.AddTag(monitoring.InfluxKeyEnvironmentID, environmentID.ToString())
 	defer contentLengthWriter.SendMonitoringData(p)
@@ -283,16 +283,16 @@ func (r *NomadJob) handleExitOrContextDone(ctx context.Context, cancelExecute co
 	// log.WithField("runner", r.id).Warn("Could not send SIGQUIT because nothing was written")
 	// }
 	if err != nil {
-		log.WithContext(ctx).WithField("runner", r.id).WithError(err).Warn("Could not send SIGQUIT due to error")
+		log.WithContext(ctx).WithError(err).Warn("Could not send SIGQUIT due to error")
 	}
 
 	select {
 	case <-exitInternal:
-		log.WithContext(ctx).WithField("runner", r.id).Debug("Execution terminated after SIGQUIT")
+		log.WithContext(ctx).Debug("Execution terminated after SIGQUIT")
 	case <-time.After(executionTimeoutGracePeriod):
-		log.WithContext(ctx).WithField("runner", r.id).Info("Execution did not quit after SIGQUIT")
+		log.WithContext(ctx).Info("Execution did not quit after SIGQUIT")
 		if err := r.Destroy(); err != nil {
-			log.WithContext(ctx).WithField("runner", r.id).Error("Error when destroying runner")
+			log.WithContext(ctx).Error("Error when destroying runner")
 		}
 	}
 }
@@ -184,6 +184,12 @@ func (f File) ByteContent() []byte {
 // ContextKey is the type for keys in a request context that is used for passing data to the next handler.
 type ContextKey string
 
+// Keys to reference information (for logging or monitoring).
+const (
+	KeyRunnerID      = "runner_id"
+	KeyEnvironmentID = "environment_id"
+)
+
 // WebSocketMessageType is the type for the messages from Poseidon to the client.
 type WebSocketMessageType string
 
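Defining the field names once in the dto package means the logger and the monitoring layer (see the Influx constants in the last hunk) agree on them, so a query for runner_id matches entries from either source. A small usage sketch with logrus, the logging library these hunks build on; key names are copied from the diff, the ID values are hypothetical:

package main

import "github.com/sirupsen/logrus"

// Key names copied from the const block above.
const (
	KeyRunnerID      = "runner_id"
	KeyEnvironmentID = "environment_id"
)

func main() {
	log := logrus.New()
	// Both fields land under exactly the names monitoring uses as tags.
	log.WithField(KeyRunnerID, "15-abc123").
		WithField(KeyEnvironmentID, "15").
		Info("Returning runner due to inactivity timeout")
}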
|
@ -3,6 +3,7 @@ package logging
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"github.com/getsentry/sentry-go"
|
"github.com/getsentry/sentry-go"
|
||||||
|
"github.com/openHPI/poseidon/pkg/dto"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -28,6 +29,9 @@ func (hook *SentryHook) Fire(entry *logrus.Entry) error {
 	var hub *sentry.Hub
 	if entry.Context != nil {
 		hub = sentry.GetHubFromContext(entry.Context)
+		// This might overwrite valid data when not the request context is passed.
+		entry.Data[dto.KeyRunnerID] = entry.Context.Value(dto.ContextKey(dto.KeyRunnerID))
+		entry.Data[dto.KeyEnvironmentID] = entry.Context.Value(dto.ContextKey(dto.KeyEnvironmentID))
 	}
 	if hub == nil {
 		hub = sentry.CurrentHub()
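Together with the middleware above, this hook closes the loop: any entry logged with log.WithContext(requestCtx) automatically gains the runner and environment fields, which is why the per-call WithField removals elsewhere in this commit lose no information. A simplified, self-contained sketch of the same pattern, using a plain logrus hook in place of the SentryHook (key type and names mirror the diff; ID values are hypothetical):

package main

import (
	"context"

	"github.com/sirupsen/logrus"
)

type ContextKey string

const (
	KeyRunnerID      = "runner_id"
	KeyEnvironmentID = "environment_id"
)

// contextFieldsHook is a stand-in for the SentryHook change: it lifts
// the IDs out of the entry's context into regular log fields.
type contextFieldsHook struct{}

func (h contextFieldsHook) Levels() []logrus.Level { return logrus.AllLevels }

func (h contextFieldsHook) Fire(entry *logrus.Entry) error {
	if entry.Context != nil {
		// As the diff's comment notes, this may overwrite fields when a
		// context other than the request context is passed.
		entry.Data[KeyRunnerID] = entry.Context.Value(ContextKey(KeyRunnerID))
		entry.Data[KeyEnvironmentID] = entry.Context.Value(ContextKey(KeyEnvironmentID))
	}
	return nil
}

func main() {
	log := logrus.New()
	log.AddHook(contextFieldsHook{})

	ctx := context.WithValue(context.Background(), ContextKey(KeyRunnerID), "15-abc123") // hypothetical runner ID
	ctx = context.WithValue(ctx, ContextKey(KeyEnvironmentID), "15")

	// No WithField is needed at the call site anymore.
	log.WithContext(ctx).Info("Running execution")
}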
@@ -34,8 +34,8 @@ const (
 
 	// The keys for the monitored tags and fields.
 
-	InfluxKeyRunnerID      = "runner_id"
-	InfluxKeyEnvironmentID = "environment_id"
+	InfluxKeyRunnerID      = dto.KeyRunnerID
+	InfluxKeyEnvironmentID = dto.KeyEnvironmentID
 	InfluxKeyJobID         = "job_id"
 	InfluxKeyClientStatus  = "client_status"
 	InfluxKeyNomadNode     = "nomad_agent"