Introduce reason for destroying runner

in order to return a specific error for OOM Killed Executions.
This commit is contained in:
Maximilian Paß
2023-06-28 18:24:50 +01:00
committed by Sebastian Serth
parent b3fedf274c
commit 6a1677dea0
15 changed files with 186 additions and 56 deletions

View File

@@ -196,3 +196,20 @@ func partOfJobID(id string, part uint) (dto.EnvironmentID, error) {
}
return dto.EnvironmentID(environmentID), nil
}
func isOOMKilled(alloc *nomadApi.Allocation) bool {
state, ok := alloc.TaskStates[TaskName]
if !ok {
return false
}
var oomKilledCount uint64
for _, event := range state.Events {
if oomString, ok := event.Details["oom_killed"]; ok {
if oom, err := strconv.ParseBool(oomString); err == nil && oom {
oomKilledCount++
}
}
}
return oomKilledCount >= state.Restarts
}

View File

@@ -141,3 +141,18 @@ func TestEnvironmentIDFromRunnerID(t *testing.T) {
_, err = EnvironmentIDFromRunnerID("")
assert.Error(t, err)
}
func TestOOMKilledAllocation(t *testing.T) {
event := nomadApi.TaskEvent{Details: map[string]string{"oom_killed": "true"}}
state := nomadApi.TaskState{Restarts: 2, Events: []*nomadApi.TaskEvent{&event}}
alloc := nomadApi.Allocation{TaskStates: map[string]*nomadApi.TaskState{TaskName: &state}}
assert.False(t, isOOMKilled(&alloc))
event2 := nomadApi.TaskEvent{Details: map[string]string{"oom_killed": "false"}}
alloc.TaskStates[TaskName].Events = []*nomadApi.TaskEvent{&event, &event2}
assert.False(t, isOOMKilled(&alloc))
event3 := nomadApi.TaskEvent{Details: map[string]string{"oom_killed": "true"}}
alloc.TaskStates[TaskName].Events = []*nomadApi.TaskEvent{&event, &event2, &event3}
assert.True(t, isOOMKilled(&alloc))
}

View File

@@ -21,12 +21,21 @@ import (
)
var (
log = logging.GetLogger("nomad")
ErrorExecutorCommunicationFailed = errors.New("communication with executor failed")
ErrorEvaluation = errors.New("evaluation could not complete")
ErrorPlacingAllocations = errors.New("failed to place all allocations")
ErrorLoadingJob = errors.New("failed to load job")
ErrorNoAllocatedResourcesFound = errors.New("no allocated resources found")
log = logging.GetLogger("nomad")
ErrorExecutorCommunicationFailed = errors.New("communication with executor failed")
ErrorEvaluation = errors.New("evaluation could not complete")
ErrorPlacingAllocations = errors.New("failed to place all allocations")
ErrorLoadingJob = errors.New("failed to load job")
ErrorNoAllocatedResourcesFound = errors.New("no allocated resources found")
ErrorOOMKilled RunnerDeletedReason = errors.New("the allocation was OOM Killed")
ErrorAllocationRescheduled RunnerDeletedReason = errors.New("the allocation was rescheduled")
ErrorAllocationStopped RunnerDeletedReason = errors.New("the allocation was stopped")
ErrorAllocationStoppedUnexpectedly RunnerDeletedReason = fmt.Errorf("%w unexpectedly", ErrorAllocationStopped)
ErrorAllocationRescheduledUnexpectedly RunnerDeletedReason = fmt.Errorf(
"%w correctly but rescheduled", ErrorAllocationStopped)
// ErrorAllocationCompleted is for reporting the reason for the stopped allocation.
// We do not consider it as an error but add it anyway for a complete reporting.
ErrorAllocationCompleted RunnerDeletedReason = errors.New("the allocation completed")
)
// resultChannelWriteTimeout is to detect the error when more element are written into a channel than expected.
@@ -37,7 +46,8 @@ type AllocationProcessing struct {
OnNew NewAllocationProcessor
OnDeleted DeletedAllocationProcessor
}
type DeletedAllocationProcessor func(jobID string) (removedByPoseidon bool)
type RunnerDeletedReason error
type DeletedAllocationProcessor func(jobID string, RunnerDeletedReason error) (removedByPoseidon bool)
type NewAllocationProcessor func(*nomadApi.Allocation, time.Duration)
type allocationData struct {
@@ -191,7 +201,7 @@ func (a *APIClient) MonitorEvaluation(evaluationID string, ctx context.Context)
go func() {
err = a.WatchEventStream(ctx, &AllocationProcessing{
OnNew: func(_ *nomadApi.Allocation, _ time.Duration) {},
OnDeleted: func(_ string) bool { return false },
OnDeleted: func(_ string, _ error) bool { return false },
})
cancel() // cancel the waiting for an evaluation result if watching the event stream ends.
}()
@@ -411,11 +421,15 @@ func handlePendingAllocationEvent(alloc *nomadApi.Allocation, allocData *allocat
case structs.AllocDesiredStatusRun:
if allocData != nil {
// Handle Allocation restart.
callbacks.OnDeleted(alloc.JobID)
var reason error
if isOOMKilled(alloc) {
reason = ErrorOOMKilled
}
callbacks.OnDeleted(alloc.JobID, reason)
} else if alloc.PreviousAllocation != "" {
// Handle Runner (/Container) re-allocations.
if prevData, ok := allocations.Get(alloc.PreviousAllocation); ok {
if removedByPoseidon := callbacks.OnDeleted(prevData.jobID); removedByPoseidon {
if removedByPoseidon := callbacks.OnDeleted(prevData.jobID, ErrorAllocationRescheduled); removedByPoseidon {
// This case handles a race condition between the overdue runner inactivity timeout and the rescheduling of a
// lost allocation. The race condition leads first to the rescheduling of the runner, but right after to it
// being stopped. Instead of reporting an unexpected stop of the pending runner, we just not start tracking it.
@@ -466,7 +480,7 @@ func handleCompleteAllocationEvent(alloc *nomadApi.Allocation, _ *allocationData
case structs.AllocDesiredStatusRun:
log.WithField("alloc", alloc).Warn("Complete allocation desires to run")
case structs.AllocDesiredStatusStop:
callbacks.OnDeleted(alloc.JobID)
callbacks.OnDeleted(alloc.JobID, ErrorAllocationCompleted)
allocations.Delete(alloc.ID)
default:
log.WithField("alloc", alloc).Warn("Other Desired Status")
@@ -477,32 +491,40 @@ func handleCompleteAllocationEvent(alloc *nomadApi.Allocation, _ *allocationData
func handleFailedAllocationEvent(alloc *nomadApi.Allocation, allocData *allocationData,
allocations storage.Storage[*allocationData], callbacks *AllocationProcessing) {
// The stop is expected when the allocation desired to stop even before it failed.
expectedStop := allocData.allocDesiredStatus == structs.AllocDesiredStatusStop
handleStoppingAllocationEvent(alloc, allocations, callbacks, expectedStop)
reschedulingExpected := allocData.allocDesiredStatus != structs.AllocDesiredStatusStop
handleStoppingAllocationEvent(alloc, allocations, callbacks, reschedulingExpected)
}
// handleLostAllocationEvent logs only the last of the multiple lost events.
func handleLostAllocationEvent(alloc *nomadApi.Allocation, allocData *allocationData,
allocations storage.Storage[*allocationData], callbacks *AllocationProcessing) {
// The stop is expected when the allocation desired to stop even before it got lost.
expectedStop := allocData.allocDesiredStatus == structs.AllocDesiredStatusStop
handleStoppingAllocationEvent(alloc, allocations, callbacks, expectedStop)
reschedulingExpected := allocData.allocDesiredStatus != structs.AllocDesiredStatusStop
handleStoppingAllocationEvent(alloc, allocations, callbacks, reschedulingExpected)
}
func handleStoppingAllocationEvent(alloc *nomadApi.Allocation, allocations storage.Storage[*allocationData],
callbacks *AllocationProcessing, expectedStop bool) {
callbacks *AllocationProcessing, reschedulingExpected bool) {
replacementAllocationScheduled := alloc.NextAllocation != ""
correctRescheduling := reschedulingExpected == replacementAllocationScheduled
removedByPoseidon := false
if alloc.NextAllocation == "" {
removedByPoseidon = callbacks.OnDeleted(alloc.JobID)
if !replacementAllocationScheduled {
var reason error
if correctRescheduling {
reason = ErrorAllocationStoppedUnexpectedly
} else {
reason = ErrorAllocationRescheduledUnexpectedly
}
removedByPoseidon = callbacks.OnDeleted(alloc.JobID, reason)
allocations.Delete(alloc.ID)
}
entry := log.WithField("job", alloc.JobID)
replacementAllocationScheduled := alloc.NextAllocation != ""
if !removedByPoseidon && expectedStop == replacementAllocationScheduled {
if !removedByPoseidon && !correctRescheduling {
entry.WithField("alloc", alloc).Warn("Unexpected Allocation Stopping / Restarting")
} else {
entry.Debug("Expected Allocation Stopping / Restarting")
entry.Trace("Expected Allocation Stopping / Restarting")
}
}

View File

@@ -26,7 +26,7 @@ import (
var (
noopAllocationProcessing = &AllocationProcessing{
OnNew: func(_ *nomadApi.Allocation, _ time.Duration) {},
OnDeleted: func(_ string) bool { return false },
OnDeleted: func(_ string, _ error) bool { return false },
}
ErrUnexpectedEOF = errors.New("unexpected EOF")
)
@@ -588,7 +588,7 @@ func TestHandleAllocationEvent_IgnoresReschedulesForStoppedJobs(t *testing.T) {
err := handleAllocationEvent(time.Now().UnixNano(), allocations, &rescheduledEvent, &AllocationProcessing{
OnNew: func(_ *nomadApi.Allocation, _ time.Duration) {},
OnDeleted: func(_ string) bool { return true },
OnDeleted: func(_ string, _ error) bool { return true },
})
require.NoError(t, err)
@@ -596,6 +596,28 @@ func TestHandleAllocationEvent_IgnoresReschedulesForStoppedJobs(t *testing.T) {
assert.False(t, ok)
}
func TestHandleAllocationEvent_ReportsOOMKilledStatus(t *testing.T) {
restartedAllocation := createRecentAllocation(structs.AllocClientStatusPending, structs.AllocDesiredStatusRun)
event := nomadApi.TaskEvent{Details: map[string]string{"oom_killed": "true"}}
state := nomadApi.TaskState{Restarts: 1, Events: []*nomadApi.TaskEvent{&event}}
restartedAllocation.TaskStates = map[string]*nomadApi.TaskState{TaskName: &state}
restartedEvent := eventForAllocation(t, restartedAllocation)
allocations := storage.NewLocalStorage[*allocationData]()
allocations.Add(restartedAllocation.ID, &allocationData{jobID: restartedAllocation.JobID})
var reason error
err := handleAllocationEvent(time.Now().UnixNano(), allocations, &restartedEvent, &AllocationProcessing{
OnNew: func(_ *nomadApi.Allocation, _ time.Duration) {},
OnDeleted: func(_ string, r error) bool {
reason = r
return true
},
})
require.NoError(t, err)
assert.ErrorIs(t, reason, ErrorOOMKilled)
}
func TestAPIClient_WatchAllocationsReturnsErrorWhenAllocationStreamCannotBeRetrieved(t *testing.T) {
apiMock := &apiQuerierMock{}
apiMock.On("EventStream", mock.Anything).Return(nil, tests.ErrDefault)
@@ -638,7 +660,7 @@ func assertWatchAllocation(t *testing.T, events []*nomadApi.Events,
OnNew: func(alloc *nomadApi.Allocation, _ time.Duration) {
newAllocations = append(newAllocations, alloc)
},
OnDeleted: func(jobID string) bool {
OnDeleted: func(jobID string, _ error) bool {
deletedAllocations = append(deletedAllocations, jobID)
return false
},