Fix loosing of rescheduled runners
that are rescheduled while the previous allocation was still pending. We fix this by removing the race condition handling that should prevent Poseidon from throwing warnings of unexpected allocation stopping.
This commit is contained in:

committed by
Sebastian Serth

parent
788cb0f660
commit
2eb15c8d93
@ -60,6 +60,9 @@ type allocationData struct {
|
|||||||
allocDesiredStatus string
|
allocDesiredStatus string
|
||||||
jobID string
|
jobID string
|
||||||
start time.Time
|
start time.Time
|
||||||
|
// stopExpected is used to suppress warnings that could be triggered by a race condition
|
||||||
|
// between the Inactivity timer and an external event leadng to allocation rescheduling.
|
||||||
|
stopExpected bool
|
||||||
// Just debugging information
|
// Just debugging information
|
||||||
allocNomadNode string
|
allocNomadNode string
|
||||||
}
|
}
|
||||||
@ -420,6 +423,7 @@ func updateAllocationData(
|
|||||||
// This allows the handling of startups and re-placements of allocations.
|
// This allows the handling of startups and re-placements of allocations.
|
||||||
func handlePendingAllocationEvent(alloc *nomadApi.Allocation, allocData *allocationData,
|
func handlePendingAllocationEvent(alloc *nomadApi.Allocation, allocData *allocationData,
|
||||||
allocations storage.Storage[*allocationData], callbacks *AllocationProcessing) {
|
allocations storage.Storage[*allocationData], callbacks *AllocationProcessing) {
|
||||||
|
var stopExpected bool
|
||||||
switch alloc.DesiredStatus {
|
switch alloc.DesiredStatus {
|
||||||
case structs.AllocDesiredStatusRun:
|
case structs.AllocDesiredStatusRun:
|
||||||
if allocData != nil {
|
if allocData != nil {
|
||||||
@ -432,15 +436,7 @@ func handlePendingAllocationEvent(alloc *nomadApi.Allocation, allocData *allocat
|
|||||||
} else if alloc.PreviousAllocation != "" {
|
} else if alloc.PreviousAllocation != "" {
|
||||||
// Handle Runner (/Container) re-allocations.
|
// Handle Runner (/Container) re-allocations.
|
||||||
if prevData, ok := allocations.Get(alloc.PreviousAllocation); ok {
|
if prevData, ok := allocations.Get(alloc.PreviousAllocation); ok {
|
||||||
if removedByPoseidon := callbacks.OnDeleted(prevData.jobID, ErrorAllocationRescheduled); removedByPoseidon {
|
stopExpected = callbacks.OnDeleted(prevData.jobID, ErrorAllocationRescheduled)
|
||||||
// This case handles a race condition between the overdue runner inactivity timeout and the rescheduling of a
|
|
||||||
// lost allocation. The race condition leads first to the rescheduling of the runner, but right after to it
|
|
||||||
// being stopped. Instead of reporting an unexpected stop of the pending runner, we just not start tracking it.
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Improve: When returning in the step before the allocation data will never be removed (data leak).
|
|
||||||
// But it also shouldn't be removed as the current event could be received multiple times.
|
|
||||||
// If we removed the allocation data the previous case handling wouldn't be triggered for the replicated event.
|
|
||||||
allocations.Delete(alloc.PreviousAllocation)
|
allocations.Delete(alloc.PreviousAllocation)
|
||||||
} else {
|
} else {
|
||||||
log.WithField("alloc", alloc).Warn("Previous Allocation not found")
|
log.WithField("alloc", alloc).Warn("Previous Allocation not found")
|
||||||
@ -454,9 +450,17 @@ func handlePendingAllocationEvent(alloc *nomadApi.Allocation, allocData *allocat
|
|||||||
jobID: alloc.JobID,
|
jobID: alloc.JobID,
|
||||||
start: time.Now(),
|
start: time.Now(),
|
||||||
allocNomadNode: alloc.NodeName,
|
allocNomadNode: alloc.NodeName,
|
||||||
|
stopExpected: stopExpected,
|
||||||
})
|
})
|
||||||
case structs.AllocDesiredStatusStop:
|
case structs.AllocDesiredStatusStop:
|
||||||
handleStoppingAllocationEvent(alloc, allocations, callbacks, false)
|
handleStoppingAllocationEvent(alloc, allocations, callbacks, false)
|
||||||
|
// Anyway, we want to monitor the occurrences.
|
||||||
|
if !allocData.stopExpected {
|
||||||
|
log.WithField("alloc", alloc).Warn("Pending allocation was stopped unexpectedly")
|
||||||
|
} else {
|
||||||
|
// ToDo: This log statement is just for measuring how common the mentioned race condition is. Remove it soon.
|
||||||
|
log.WithField("alloc", alloc).Warn("Pending allocation was stopped expectedly")
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
log.WithField("alloc", alloc).Warn("Other Desired Status")
|
log.WithField("alloc", alloc).Warn("Other Desired Status")
|
||||||
}
|
}
|
||||||
|
@ -578,26 +578,6 @@ func (s *MainTestSuite) TestHandleAllocationEventBuffersPendingAllocation() {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *MainTestSuite) TestHandleAllocationEvent_IgnoresReschedulesForStoppedJobs() {
|
|
||||||
startedAllocation := createRecentAllocation(structs.AllocClientStatusPending, structs.AllocDesiredStatusRun)
|
|
||||||
rescheduledAllocation := createRecentAllocation(structs.AllocClientStatusPending, structs.AllocDesiredStatusRun)
|
|
||||||
rescheduledAllocation.ID = tests.AnotherUUID
|
|
||||||
rescheduledAllocation.PreviousAllocation = startedAllocation.ID
|
|
||||||
rescheduledEvent := eventForAllocation(s.T(), rescheduledAllocation)
|
|
||||||
|
|
||||||
allocations := storage.NewLocalStorage[*allocationData]()
|
|
||||||
allocations.Add(startedAllocation.ID, &allocationData{jobID: startedAllocation.JobID})
|
|
||||||
|
|
||||||
err := handleAllocationEvent(time.Now().UnixNano(), allocations, &rescheduledEvent, &AllocationProcessing{
|
|
||||||
OnNew: func(_ *nomadApi.Allocation, _ time.Duration) {},
|
|
||||||
OnDeleted: func(_ string, _ error) bool { return true },
|
|
||||||
})
|
|
||||||
s.Require().NoError(err)
|
|
||||||
|
|
||||||
_, ok := allocations.Get(rescheduledAllocation.ID)
|
|
||||||
s.False(ok)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *MainTestSuite) TestHandleAllocationEvent_RegressionTest_14_09_2023() {
|
func (s *MainTestSuite) TestHandleAllocationEvent_RegressionTest_14_09_2023() {
|
||||||
jobID := "29-6f04b525-5315-11ee-af32-fa163e079f19"
|
jobID := "29-6f04b525-5315-11ee-af32-fa163e079f19"
|
||||||
a1ID := "04d86250-550c-62f9-9a21-ecdc3b38773e"
|
a1ID := "04d86250-550c-62f9-9a21-ecdc3b38773e"
|
||||||
|
Reference in New Issue
Block a user