From b48c7fe8b6fa9226e2359735e5d9d80550f7e75a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20Pa=C3=9F?= <22845248+mpass99@users.noreply.github.com> Date: Tue, 5 Dec 2023 20:28:25 +0100 Subject: [PATCH] Configure Systemd Watchdog that monitors the reachability of Poseidon and automatically restarts Poseidon if required. --- .github/workflows/ci.yml | 19 ++++- .../resources/poseidon-minimal.service | 7 +- cmd/poseidon/main.go | 84 +++++++++++++++++-- cmd/poseidon/main_test.go | 2 +- tests/recovery/e2e_recovery_test.go | 19 +++++ tests/recovery/setup_test.go | 3 + 6 files changed, 122 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 64e4ec8..27e00de 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -208,6 +208,10 @@ jobs: ./poseidon | tee poseidon.log & until curl -s --fail http://localhost:7200/api/v1/health ; do sleep 1; done make e2e-test + - name: Write Environment Variables to file + run: | + echo "${{ vars }}" + if: ${{ success() || failure() }} - name: Setup Poseidon Socket run: | killall poseidon @@ -216,13 +220,22 @@ jobs: cat ./.github/workflows/resources/poseidon-minimal.service | envsubst > ~/.config/systemd/user/poseidon.service systemctl --user daemon-reload systemctl --user start poseidon.socket + if: ${{ success() || failure() }} - name: Print Poseidon Failure logs if: failure() - run: journalctl -xen --no-pager + run: journalctl --user -xen --no-pager - name: Run e2e recovery tests + run: make e2e-test-recovery + if: ${{ success() || failure() }} + - name: Print Systemd Failure logs run: | - tail -f /var/log/syslog & - make e2e-test-recovery + /usr/bin/systemctl --user show poseidon.service -p NRestarts + journalctl --user -xe -u poseidon.service --no-pager + if: failure() + - name: Stop Poseidon to flush the coverage file + run: | + systemctl --user stop poseidon.service poseidon.socket + ls -lah ${GOCOVERDIR} if: ${{ success() || failure() }} - name: Convert coverage reports run: make convert-run-coverage diff --git a/.github/workflows/resources/poseidon-minimal.service b/.github/workflows/resources/poseidon-minimal.service index 559178e..a8b3818 100644 --- a/.github/workflows/resources/poseidon-minimal.service +++ b/.github/workflows/resources/poseidon-minimal.service @@ -8,5 +8,10 @@ Requires=poseidon.socket [Service] WorkingDirectory=${GITHUB_WORKSPACE} ExecStart=${GITHUB_WORKSPACE}/poseidon -Restart=always Environment="POSEIDON_SERVER_SYSTEMDSOCKETACTIVATION=TRUE" + +Restart=always +StartLimitBurst=0 + +Type=notify +WatchdogSec=5 diff --git a/cmd/poseidon/main.go b/cmd/poseidon/main.go index 19e4f4c..ae3f309 100644 --- a/cmd/poseidon/main.go +++ b/cmd/poseidon/main.go @@ -2,11 +2,14 @@ package main import ( "context" + "crypto/tls" "errors" "fmt" "github.com/coreos/go-systemd/v22/activation" + "github.com/coreos/go-systemd/v22/daemon" "github.com/getsentry/sentry-go" sentryhttp "github.com/getsentry/sentry-go/http" + "github.com/gorilla/mux" "github.com/openHPI/poseidon/internal/api" "github.com/openHPI/poseidon/internal/config" "github.com/openHPI/poseidon/internal/environment" @@ -155,11 +158,12 @@ func watchMemoryAndAlert(options config.Profiling) { } } -func runServer(server *http.Server, cancel context.CancelFunc) { +func runServer(router *mux.Router, server *http.Server, cancel context.CancelFunc) { defer cancel() defer shutdownSentry() // shutdownSentry must be executed in the main goroutine. httpListeners := getHTTPListeners(server) + notifySystemd(router) serveHTTPListeners(server, httpListeners) } @@ -214,6 +218,67 @@ func serveHTTPListener(server *http.Server, l net.Listener) { } } +func notifySystemd(router *mux.Router) { + notify, err := daemon.SdNotify(false, daemon.SdNotifyReady) + switch { + case err == nil && !notify: + log.Debug("Systemd Readiness Notification not supported") + case err != nil: + log.WithError(err).WithField("notify", notify).Warn("Failed notifying Readiness to Systemd") + default: + log.Trace("Notified Readiness to Systemd") + } + + interval, err := daemon.SdWatchdogEnabled(false) + if err != nil || interval == 0 { + log.WithError(err).Error("Systemd Watchdog not supported") + return + } + go notifyWatchdog(context.Background(), router, interval) +} + +func notifyWatchdog(ctx context.Context, router *mux.Router, interval time.Duration) { + healthRoute, err := router.Get(api.HealthPath).URL() + if err != nil { + log.WithError(err).Error("Failed to parse Health route") + return + } + // We do not verify the certificate as we (intend to) perform only requests to the local server. + tlsConfig := &tls.Config{InsecureSkipVerify: true} // #nosec G402 The default min tls version is secure. + client := &http.Client{Transport: &http.Transport{TLSClientConfig: tlsConfig}} + + // notificationIntervalFactor defines how many more notifications we send than required. + const notificationIntervalFactor = 2 + for { + select { + case <-ctx.Done(): + return + case <-time.After(interval / notificationIntervalFactor): + req, err := http.NewRequestWithContext(ctx, http.MethodGet, config.Config.Server.URL().String()+healthRoute.String(), http.NoBody) + if err != nil { + continue + } + resp, err := client.Do(req) + if err != nil { + // We do not check for resp.StatusCode == 503 as Poseidon's error recovery will try to handle such errors + // by itself. The Watchdog should just check that Poseidon handles http requests at all. + continue + } + _ = resp.Body.Close() + + notify, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog) + switch { + case err == nil && !notify: + log.Debug("Systemd Watchdog Notification not supported") + case err != nil: + log.WithError(err).WithField("notify", notify).Warn("Failed notifying Systemd Watchdog") + default: + log.Trace("Notified Systemd Watchdog") + } + } + } +} + type managerCreator func(ctx context.Context) ( runnerManager runner.Manager, environmentManager environment.ManagerHandler) @@ -279,15 +344,19 @@ func createAWSManager(ctx context.Context) ( return runnerManager, environment.NewAWSEnvironmentManager(runnerManager) } -// initServer builds the http server and configures it with the chain of responsibility for multiple managers. -func initServer(ctx context.Context) *http.Server { +// initRouter builds a router that serves the API with the chain of responsibility for multiple managers. +func initRouter(ctx context.Context) *mux.Router { runnerManager, environmentManager := createManagerHandler(createNomadManager, config.Config.Nomad.Enabled, nil, nil, ctx) runnerManager, environmentManager = createManagerHandler(createAWSManager, config.Config.AWS.Enabled, runnerManager, environmentManager, ctx) - handler := api.NewRouter(runnerManager, environmentManager) - sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(handler) + return api.NewRouter(runnerManager, environmentManager) +} + +// initServer creates a server that serves the routes provided by the router. +func initServer(router *mux.Router) *http.Server { + sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(router) return &http.Server{ Addr: config.Config.Server.URL().Host, @@ -347,7 +416,8 @@ func main() { go watchMemoryAndAlert(config.Config.Profiling) ctx, cancel := context.WithCancel(context.Background()) - server := initServer(ctx) - go runServer(server, cancel) + router := initRouter(ctx) + server := initServer(router) + go runServer(router, server, cancel) shutdownOnOSSignal(server, ctx, stopProfiling) } diff --git a/cmd/poseidon/main_test.go b/cmd/poseidon/main_test.go index 2694454..7f28362 100644 --- a/cmd/poseidon/main_test.go +++ b/cmd/poseidon/main_test.go @@ -54,7 +54,7 @@ func (s *MainTestSuite) TestShutdownOnOSSignal_Profiling() { s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal waits for an exit after stopping the profiling. s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal triggers a os.Signal Goroutine. - server := initServer(disableRecovery) + server := initServer(initRouter(disableRecovery)) go shutdownOnOSSignal(server, context.Background(), func() { called = true }) diff --git a/tests/recovery/e2e_recovery_test.go b/tests/recovery/e2e_recovery_test.go index a1ec322..93ca24c 100644 --- a/tests/recovery/e2e_recovery_test.go +++ b/tests/recovery/e2e_recovery_test.go @@ -14,6 +14,9 @@ import ( "github.com/stretchr/testify/suite" "net/http" "os" + "os/exec" + "strconv" + "strings" "testing" "time" ) @@ -120,3 +123,19 @@ func (s *E2ERecoveryTestSuite) TestEnvironmentStatistics() { s.Equal(uint(PrewarmingPoolSize), environmentStatistics.IdleRunners) s.Equal(uint(1), environmentStatistics.UsedRunners) } + +func (s *E2ERecoveryTestSuite) TestWatchdogNotifications() { + // Wait for `WatchdogSec` to be passed. + <-time.After((5 + 1) * time.Second) + + // If the Watchdog has not received the notification by now it will restart Poseidon. + cmd := exec.Command("/usr/bin/systemctl", "--user", "show", "poseidon.service", "-p", "NRestarts") + s.Require().NoError(cmd.Err) + out, err := cmd.Output() + s.Require().NoError(err) + + restarts, err := strconv.Atoi(strings.Trim(strings.ReplaceAll(string(out), "NRestarts=", ""), "\n")) + s.Require().NoError(err) + // If Poseidon would not notify the systemd watchdog, we would have one more restart than expected. + s.Equal(PoseidonRestartCount, restarts) +} diff --git a/tests/recovery/setup_test.go b/tests/recovery/setup_test.go index 64ec3f9..88b2fb5 100644 --- a/tests/recovery/setup_test.go +++ b/tests/recovery/setup_test.go @@ -46,6 +46,8 @@ func waitForPoseidon() { } } +var PoseidonRestartCount = 0 + func killPoseidon() { processes, err := process.Processes() if err != nil { @@ -62,6 +64,7 @@ func killPoseidon() { log.WithError(err).Error("Error killing Poseidon") } else { log.Info("Killed Poseidon") + PoseidonRestartCount++ } } }