Add Prewarming Pool Alert

that checks for every environment whether the filled share of the prewarming pool is at least the specified threshold.
This commit is contained in:
Maximilian Paß
2023-10-31 13:06:53 +01:00
committed by Sebastian Serth
parent 1be3ce5ae1
commit c46a09eeae
9 changed files with 101 additions and 22 deletions

View File

@ -142,6 +142,7 @@ components:
- NOMAD_UNREACHABLE
- NOMAD_OVERLOAD
- NOMAD_INTERNAL_SERVER_ERROR
- PREWARMING_POOL_DEPLETING
- UNKNOWN
example: NOMAD_UNREACHABLE
@ -162,7 +163,8 @@ paths:
responses:
"204":
description: Everything okay
"503":
$ref: "#/components/responses/InternalServerError"
/version:
get:
summary: Retrieve the version of Poseidon

View File

@ -19,6 +19,10 @@ server:
interactivestderr: true
# If set, the file at the given path overwrites the default Nomad job file in internal/environment/template-environment-job.hcl
# templatejobfile: ./poseidon.hcl
# The prewarming pool alert threshold [0, 1) defines which part of the prewarming pool should always be filled.
# Setting it to 0 will disable the alert.
# If less than this share of the prewarming pool is filled (e.g., less than 50% for a threshold of 0.5), the health route of Poseidon will return a warning.
prewarmingpoolalertthreshold: 0.5
# Configuration of the used Nomad cluster
nomad:

View File

@ -46,7 +46,7 @@ func configureV1Router(router *mux.Router,
w.WriteHeader(http.StatusNotFound)
})
v1 := router.PathPrefix(BasePath).Subrouter()
v1.HandleFunc(HealthPath, Health).Methods(http.MethodGet).Name(HealthPath)
v1.HandleFunc(HealthPath, Health(environmentManager)).Methods(http.MethodGet).Name(HealthPath)
v1.HandleFunc(VersionPath, Version).Methods(http.MethodGet).Name(VersionPath)
runnerController := &RunnerController{manager: runnerManager}

View File

@ -1,12 +1,42 @@
package api
import (
	"errors"
	"fmt"
	"net/http"
	"sort"
	"strconv"
	"strings"

	"github.com/openHPI/poseidon/internal/config"
	"github.com/openHPI/poseidon/internal/environment"
	"github.com/openHPI/poseidon/pkg/dto"
)
var ErrorPrewarmingPoolDepleting = errors.New("the prewarming pool is depleting")
// Health handles the health route.
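// It responds with 204 No Content if the server is alive and no prewarming pool is depleting.
// If a prewarming pool is depleting, it responds with 503 Service Unavailable and an error description.
// If the server is not alive, the response won't reach the client.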
func Health(writer http.ResponseWriter, _ *http.Request) {
writer.WriteHeader(http.StatusNoContent)
func Health(manager environment.Manager) http.HandlerFunc {
	return func(writer http.ResponseWriter, request *http.Request) {
		err := checkPrewarmingPool(manager)
		if err == nil {
			// Nothing to report: the server is alive and healthy.
			writer.WriteHeader(http.StatusNoContent)
			return
		}

		// Report the failed check as an unavailable service with a machine-readable error code.
		body := &dto.InternalServerError{Message: err.Error(), ErrorCode: dto.PrewarmingPoolDepleting}
		sendJSON(writer, body, http.StatusServiceUnavailable, request.Context())
	}
}
// checkPrewarmingPool reports the environments whose share of idle runners in the prewarming pool
// is below config.Config.Server.PrewarmingPoolAlertThreshold.
// It returns nil if no environment is below the threshold. Otherwise, it returns an error wrapping
// ErrorPrewarmingPoolDepleting that lists the affected environment IDs in ascending order.
func checkPrewarmingPool(manager environment.Manager) error {
	threshold := config.Config.Server.PrewarmingPoolAlertThreshold

	var depletingEnvironments []int
	for _, data := range manager.Statistics() {
		// Without a prewarming pool there is nothing that can deplete. Skipping such environments
		// explicitly avoids relying on the NaN/+Inf result of a floating-point division by zero.
		if data.PrewarmingPoolSize == 0 {
			continue
		}
		if float64(data.IdleRunners)/float64(data.PrewarmingPoolSize) < threshold {
			depletingEnvironments = append(depletingEnvironments, data.ID)
		}
	}
	if len(depletingEnvironments) == 0 {
		return nil
	}

	// The statistics are iterated in random (map) order; sort the IDs to get a stable message.
	sort.Ints(depletingEnvironments)
	ids := make([]string, 0, len(depletingEnvironments))
	for _, id := range depletingEnvironments {
		ids = append(ids, strconv.Itoa(id))
	}
	return fmt.Errorf("%w: environments %s", ErrorPrewarmingPoolDepleting, strings.Join(ids, ","))
}

View File

@ -1,16 +1,55 @@
package api
import (
"encoding/json"
"github.com/openHPI/poseidon/internal/config"
"github.com/openHPI/poseidon/internal/environment"
"github.com/openHPI/poseidon/pkg/dto"
"github.com/openHPI/poseidon/tests"
"io"
"net/http"
"net/http/httptest"
)
func (s *MainTestSuite) TestHealthRoute() {
request, err := http.NewRequest(http.MethodGet, "/health", http.NoBody)
if err != nil {
s.T().Fatal(err)
}
recorder := httptest.NewRecorder()
http.HandlerFunc(Health).ServeHTTP(recorder, request)
s.Equal(http.StatusNoContent, recorder.Code)
// TestHealth verifies the responses of the health route handler.
func (s *MainTestSuite) TestHealth() {
	s.Run("returns StatusNoContent as default", func() {
		request, err := http.NewRequest(http.MethodGet, "/health", http.NoBody)
		if err != nil {
			s.T().Fatal(err)
		}
		recorder := httptest.NewRecorder()
		// No environment is reported, so no prewarming pool can be below the threshold.
		manager := &environment.ManagerHandlerMock{}
		manager.On("Statistics").Return(map[dto.EnvironmentID]*dto.StatisticalExecutionEnvironmentData{})

		Health(manager).ServeHTTP(recorder, request)
		s.Equal(http.StatusNoContent, recorder.Code)
	})

	s.Run("returns InternalServerError for warnings and errors", func() {
		s.Run("Prewarming Pool Alert", func() {
			request, err := http.NewRequest(http.MethodGet, "/health", http.NoBody)
			if err != nil {
				s.T().Fatal(err)
			}
			recorder := httptest.NewRecorder()
			// One of three runners is idle, which is below the threshold of 0.5 set below.
			manager := &environment.ManagerHandlerMock{}
			manager.On("Statistics").Return(map[dto.EnvironmentID]*dto.StatisticalExecutionEnvironmentData{
				tests.DefaultEnvironmentIDAsInteger: {
					ID:                 tests.DefaultEnvironmentIDAsInteger,
					PrewarmingPoolSize: 3,
					IdleRunners:        1,
				},
			})

			// The threshold is package-level state; restore it so that other tests are not affected.
			previousThreshold := config.Config.Server.PrewarmingPoolAlertThreshold
			defer func() { config.Config.Server.PrewarmingPoolAlertThreshold = previousThreshold }()
			config.Config.Server.PrewarmingPoolAlertThreshold = 0.5

			Health(manager).ServeHTTP(recorder, request)
			s.Equal(http.StatusServiceUnavailable, recorder.Code)

			b, err := io.ReadAll(recorder.Body)
			s.Require().NoError(err)
			var details dto.InternalServerError
			err = json.Unmarshal(b, &details)
			s.Require().NoError(err)
			s.Contains(details.Message, ErrorPrewarmingPoolDepleting.Error())
		})
	})
}

View File

@ -29,8 +29,9 @@ var (
CertFile: "",
KeyFile: "",
},
InteractiveStderr: true,
TemplateJobFile: "",
InteractiveStderr: true,
TemplateJobFile: "",
PrewarmingPoolAlertThreshold: 0,
},
Nomad: Nomad{
Enabled: true,
@ -81,12 +82,13 @@ var (
// server configures the Poseidon webserver.
type server struct {
Address string
Port int
Token string
TLS TLS
InteractiveStderr bool
TemplateJobFile string
Address string
Port int
Token string
TLS TLS
InteractiveStderr bool
TemplateJobFile string
// PrewarmingPoolAlertThreshold is the share of a prewarming pool that should always be filled.
// The health check compares the idle-runner ratio of each environment against it; 0 disables the alert.
PrewarmingPoolAlertThreshold float64
}
// URL returns the URL of the Poseidon webserver.

View File

@ -60,8 +60,9 @@ func (a *AWSEnvironment) Sample() (r runner.Runner, ok bool) {
// The following methods are not supported at this moment.
// IdleRunnerCount is not supported as we have no information about the AWS managed prewarming pool.
// For the Poseidon Health check we default to 1.
func (a *AWSEnvironment) IdleRunnerCount() uint {
// The fill level of the AWS-managed pool is unknown, so a fixed placeholder value is returned.
return 0
return 1
}
// PrewarmingPoolSize is neither supported nor required. It is handled transparently by AWS.

View File

@ -328,5 +328,6 @@ const (
ErrorNomadUnreachable ErrorCode = "NOMAD_UNREACHABLE"
ErrorNomadOverload ErrorCode = "NOMAD_OVERLOAD"
ErrorNomadInternalServerError ErrorCode = "NOMAD_INTERNAL_SERVER_ERROR"
PrewarmingPoolDepleting ErrorCode = "PREWARMING_POOL_DEPLETING"
ErrorUnknown ErrorCode = "UNKNOWN"
)