From 1239699e74c4adba6426a58504c380d6236930ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Pa=C3=9F?= <22845248+mpass99@users.noreply.github.com> Date: Thu, 23 Dec 2021 13:10:55 +0100 Subject: [PATCH] Add a warning when allocations fail (#83) * Log a warning when an allocation fails * Restructure allocation event handling --- internal/nomad/nomad.go | 64 ++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/internal/nomad/nomad.go b/internal/nomad/nomad.go index 031d3e2..100d124 100644 --- a/internal/nomad/nomad.go +++ b/internal/nomad/nomad.go @@ -266,25 +266,61 @@ func handleAllocationEvent(startTime int64, pendingAllocations map[string]bool, return nil } - if alloc.ClientStatus == structs.AllocClientStatusRunning { - switch alloc.DesiredStatus { - case structs.AllocDesiredStatusStop: - onDeletedAllocation(alloc) - case structs.AllocDesiredStatusRun: - // is first event that marks the transition between pending and running? - _, ok := pendingAllocations[alloc.ID] - if ok { - onNewAllocation(alloc) - delete(pendingAllocations, alloc.ID) - } - } + switch alloc.ClientStatus { + case structs.AllocClientStatusPending: + handlePendingAllocationEvent(alloc, pendingAllocations) + case structs.AllocClientStatusRunning: + handleRunningAllocationEvent(alloc, pendingAllocations, onNewAllocation, onDeletedAllocation) + case structs.AllocClientStatusFailed: + handleFailedAllocationEvent(alloc) } + return nil +} - if alloc.ClientStatus == structs.AllocClientStatusPending && alloc.DesiredStatus == structs.AllocDesiredStatusRun { +// handlePendingAllocationEvent sets flag in pendingAllocations that can be used to filter following events. +func handlePendingAllocationEvent(alloc *nomadApi.Allocation, pendingAllocations map[string]bool) { + if alloc.DesiredStatus == structs.AllocDesiredStatusRun { // allocation is started, wait until it runs and add to our list afterwards pendingAllocations[alloc.ID] = true } - return nil +} + +// handleRunningAllocationEvent calls the passed AllocationProcessor filtering similar events. +func handleRunningAllocationEvent(alloc *nomadApi.Allocation, + pendingAllocations map[string]bool, onNewAllocation, onDeletedAllocation AllocationProcessor) { + switch alloc.DesiredStatus { + case structs.AllocDesiredStatusStop: + onDeletedAllocation(alloc) + case structs.AllocDesiredStatusRun: + // is first event that marks the transition between pending and running? + _, ok := pendingAllocations[alloc.ID] + if ok { + onNewAllocation(alloc) + delete(pendingAllocations, alloc.ID) + } + } +} + +// handleFailedAllocationEvent logs only the first of the multiple failure events. +func handleFailedAllocationEvent(alloc *nomadApi.Allocation) { + if alloc.FollowupEvalID == "" && alloc.PreviousAllocation == "" { + log.WithField("job", alloc.JobID). + WithField("reason", failureDisplayMessage(alloc)). + WithField("alloc", alloc). + Warn("Allocation failure") + } +} + +// failureDisplayMessage parses the DisplayMessage of a failed allocation. +func failureDisplayMessage(alloc *nomadApi.Allocation) (msg string) { + for _, state := range alloc.TaskStates { + for _, event := range state.Events { + if event.FailsTask { + return event.DisplayMessage + } + } + } + return "" } // checkEvaluation checks whether the given evaluation failed.