Configure Systemd Watchdog

that monitors the reachability of Poseidon and automatically restarts Poseidon if required.
This commit is contained in:
Maximilian Paß
2023-12-05 20:28:25 +01:00
committed by Sebastian Serth
parent 2d34854450
commit b48c7fe8b6
6 changed files with 122 additions and 12 deletions

View File

@ -208,6 +208,10 @@ jobs:
./poseidon | tee poseidon.log & ./poseidon | tee poseidon.log &
until curl -s --fail http://localhost:7200/api/v1/health ; do sleep 1; done until curl -s --fail http://localhost:7200/api/v1/health ; do sleep 1; done
make e2e-test make e2e-test
- name: Write Environment Variables to file
run: |
echo "${{ vars }}"
if: ${{ success() || failure() }}
- name: Setup Poseidon Socket - name: Setup Poseidon Socket
run: | run: |
killall poseidon killall poseidon
@ -216,13 +220,22 @@ jobs:
cat ./.github/workflows/resources/poseidon-minimal.service | envsubst > ~/.config/systemd/user/poseidon.service cat ./.github/workflows/resources/poseidon-minimal.service | envsubst > ~/.config/systemd/user/poseidon.service
systemctl --user daemon-reload systemctl --user daemon-reload
systemctl --user start poseidon.socket systemctl --user start poseidon.socket
if: ${{ success() || failure() }}
- name: Print Poseidon Failure logs - name: Print Poseidon Failure logs
if: failure() if: failure()
run: journalctl -xen --no-pager run: journalctl --user -xen --no-pager
- name: Run e2e recovery tests - name: Run e2e recovery tests
run: make e2e-test-recovery
if: ${{ success() || failure() }}
- name: Print Systemd Failure logs
run: | run: |
tail -f /var/log/syslog & /usr/bin/systemctl --user show poseidon.service -p NRestarts
make e2e-test-recovery journalctl --user -xe -u poseidon.service --no-pager
if: failure()
- name: Stop Poseidon to flush the coverage file
run: |
systemctl --user stop poseidon.service poseidon.socket
ls -lah ${GOCOVERDIR}
if: ${{ success() || failure() }} if: ${{ success() || failure() }}
- name: Convert coverage reports - name: Convert coverage reports
run: make convert-run-coverage run: make convert-run-coverage

View File

@ -8,5 +8,10 @@ Requires=poseidon.socket
[Service] [Service]
WorkingDirectory=${GITHUB_WORKSPACE} WorkingDirectory=${GITHUB_WORKSPACE}
ExecStart=${GITHUB_WORKSPACE}/poseidon ExecStart=${GITHUB_WORKSPACE}/poseidon
Restart=always
Environment="POSEIDON_SERVER_SYSTEMDSOCKETACTIVATION=TRUE" Environment="POSEIDON_SERVER_SYSTEMDSOCKETACTIVATION=TRUE"
Restart=always
StartLimitBurst=0
Type=notify
WatchdogSec=5

View File

@ -2,11 +2,14 @@ package main
import ( import (
"context" "context"
"crypto/tls"
"errors" "errors"
"fmt" "fmt"
"github.com/coreos/go-systemd/v22/activation" "github.com/coreos/go-systemd/v22/activation"
"github.com/coreos/go-systemd/v22/daemon"
"github.com/getsentry/sentry-go" "github.com/getsentry/sentry-go"
sentryhttp "github.com/getsentry/sentry-go/http" sentryhttp "github.com/getsentry/sentry-go/http"
"github.com/gorilla/mux"
"github.com/openHPI/poseidon/internal/api" "github.com/openHPI/poseidon/internal/api"
"github.com/openHPI/poseidon/internal/config" "github.com/openHPI/poseidon/internal/config"
"github.com/openHPI/poseidon/internal/environment" "github.com/openHPI/poseidon/internal/environment"
@ -155,11 +158,12 @@ func watchMemoryAndAlert(options config.Profiling) {
} }
} }
func runServer(server *http.Server, cancel context.CancelFunc) { func runServer(router *mux.Router, server *http.Server, cancel context.CancelFunc) {
defer cancel() defer cancel()
defer shutdownSentry() // shutdownSentry must be executed in the main goroutine. defer shutdownSentry() // shutdownSentry must be executed in the main goroutine.
httpListeners := getHTTPListeners(server) httpListeners := getHTTPListeners(server)
notifySystemd(router)
serveHTTPListeners(server, httpListeners) serveHTTPListeners(server, httpListeners)
} }
@ -214,6 +218,67 @@ func serveHTTPListener(server *http.Server, l net.Listener) {
} }
} }
// notifySystemd tells systemd that Poseidon is ready and, if systemd asked for
// watchdog keep-alives, starts the background watchdog notifier.
//
// Both steps are best-effort: when Poseidon is not started by systemd (or the
// unit does not configure a watchdog) the function only logs and returns.
// The router is passed through to notifyWatchdog, which resolves the health route from it.
func notifySystemd(router *mux.Router) {
	notify, err := daemon.SdNotify(false, daemon.SdNotifyReady)
	switch {
	case err == nil && !notify:
		log.Debug("Systemd Readiness Notification not supported")
	case err != nil:
		log.WithError(err).WithField("notify", notify).Warn("Failed notifying Readiness to Systemd")
	default:
		log.Trace("Notified Readiness to Systemd")
	}

	interval, err := daemon.SdWatchdogEnabled(false)
	switch {
	case err != nil:
		// The watchdog environment is present but could not be interpreted.
		log.WithError(err).Error("Systemd Watchdog not supported")
		return
	case interval <= 0:
		// No error and no interval: the watchdog is simply not enabled for this process.
		// This is an expected setup (mirrors the readiness case above), not an error.
		log.Debug("Systemd Watchdog not enabled")
		return
	}

	// The notifier is started with a background context, so it is never cancelled
	// from here and runs until the process exits.
	go notifyWatchdog(context.Background(), router, interval)
}
// notifyWatchdog periodically requests Poseidon's own health route and, whenever
// the request gets any http response, sends a keep-alive to the systemd watchdog.
//
// ctx stops the loop when cancelled. router is used to resolve the health route.
// interval is the watchdog interval; checks are performed twice per interval.
// The function blocks until ctx is done and returns early (after logging) if the
// health route cannot be resolved.
func notifyWatchdog(ctx context.Context, router *mux.Router, interval time.Duration) {
	// Router.Get yields nil for an unknown route name; guard before dereferencing it.
	route := router.Get(api.HealthPath)
	if route == nil {
		log.WithField("route", api.HealthPath).Error("Health route not registered")
		return
	}
	healthRoute, err := route.URL()
	if err != nil {
		log.WithError(err).Error("Failed to parse Health route")
		return
	}
	// The target does not change between iterations, so build it once.
	healthURL := config.Config.Server.URL().String() + healthRoute.String()

	// We do not verify the certificate as we (intend to) perform only requests to the local server.
	tlsConfig := &tls.Config{InsecureSkipVerify: true} // #nosec G402 The default min tls version is secure.
	// A request that takes longer than the whole watchdog interval has missed its
	// deadline anyway, so bound it instead of letting a stuck request block forever.
	client := &http.Client{
		Timeout:   interval,
		Transport: &http.Transport{TLSClientConfig: tlsConfig},
	}

	// notificationIntervalFactor defines how many more notifications we send than required.
	const notificationIntervalFactor = 2

	for {
		select {
		case <-ctx.Done():
			return
		case <-time.After(interval / notificationIntervalFactor):
			req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, http.NoBody)
			if err != nil {
				log.WithError(err).Warn("Failed creating Watchdog health request")
				continue
			}
			resp, err := client.Do(req)
			if err != nil {
				// We do not check for resp.StatusCode == 503 as Poseidon's error recovery will try to handle such errors
				// by itself. The Watchdog should just check that Poseidon handles http requests at all.
				log.WithError(err).Warn("Watchdog health request failed")
				continue
			}
			// Only the fact that a response arrived matters; a close error changes nothing.
			_ = resp.Body.Close()

			notify, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog)
			switch {
			case err == nil && !notify:
				log.Debug("Systemd Watchdog Notification not supported")
			case err != nil:
				log.WithError(err).WithField("notify", notify).Warn("Failed notifying Systemd Watchdog")
			default:
				log.Trace("Notified Systemd Watchdog")
			}
		}
	}
}
type managerCreator func(ctx context.Context) ( type managerCreator func(ctx context.Context) (
runnerManager runner.Manager, environmentManager environment.ManagerHandler) runnerManager runner.Manager, environmentManager environment.ManagerHandler)
@ -279,15 +344,19 @@ func createAWSManager(ctx context.Context) (
return runnerManager, environment.NewAWSEnvironmentManager(runnerManager) return runnerManager, environment.NewAWSEnvironmentManager(runnerManager)
} }
// initServer builds the http server and configures it with the chain of responsibility for multiple managers. // initRouter builds a router that serves the API with the chain of responsibility for multiple managers.
func initServer(ctx context.Context) *http.Server { func initRouter(ctx context.Context) *mux.Router {
runnerManager, environmentManager := createManagerHandler(createNomadManager, config.Config.Nomad.Enabled, runnerManager, environmentManager := createManagerHandler(createNomadManager, config.Config.Nomad.Enabled,
nil, nil, ctx) nil, nil, ctx)
runnerManager, environmentManager = createManagerHandler(createAWSManager, config.Config.AWS.Enabled, runnerManager, environmentManager = createManagerHandler(createAWSManager, config.Config.AWS.Enabled,
runnerManager, environmentManager, ctx) runnerManager, environmentManager, ctx)
handler := api.NewRouter(runnerManager, environmentManager) return api.NewRouter(runnerManager, environmentManager)
sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(handler) }
// initServer creates a server that serves the routes provided by the router.
func initServer(router *mux.Router) *http.Server {
sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(router)
return &http.Server{ return &http.Server{
Addr: config.Config.Server.URL().Host, Addr: config.Config.Server.URL().Host,
@ -347,7 +416,8 @@ func main() {
go watchMemoryAndAlert(config.Config.Profiling) go watchMemoryAndAlert(config.Config.Profiling)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
server := initServer(ctx) router := initRouter(ctx)
go runServer(server, cancel) server := initServer(router)
go runServer(router, server, cancel)
shutdownOnOSSignal(server, ctx, stopProfiling) shutdownOnOSSignal(server, ctx, stopProfiling)
} }

View File

@ -54,7 +54,7 @@ func (s *MainTestSuite) TestShutdownOnOSSignal_Profiling() {
s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal waits for an exit after stopping the profiling. s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal waits for an exit after stopping the profiling.
s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal triggers a os.Signal Goroutine. s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal triggers a os.Signal Goroutine.
server := initServer(disableRecovery) server := initServer(initRouter(disableRecovery))
go shutdownOnOSSignal(server, context.Background(), func() { go shutdownOnOSSignal(server, context.Background(), func() {
called = true called = true
}) })

View File

@ -14,6 +14,9 @@ import (
"github.com/stretchr/testify/suite" "github.com/stretchr/testify/suite"
"net/http" "net/http"
"os" "os"
"os/exec"
"strconv"
"strings"
"testing" "testing"
"time" "time"
) )
@ -120,3 +123,19 @@ func (s *E2ERecoveryTestSuite) TestEnvironmentStatistics() {
s.Equal(uint(PrewarmingPoolSize), environmentStatistics.IdleRunners) s.Equal(uint(PrewarmingPoolSize), environmentStatistics.IdleRunners)
s.Equal(uint(1), environmentStatistics.UsedRunners) s.Equal(uint(1), environmentStatistics.UsedRunners)
} }
// TestWatchdogNotifications verifies that Poseidon keeps the systemd watchdog
// satisfied: after waiting longer than the unit's `WatchdogSec` (5 seconds), the
// restart counter reported by systemd must equal the number of restarts the
// test helpers caused on purpose.
func (s *E2ERecoveryTestSuite) TestWatchdogNotifications() {
	// Let one full `WatchdogSec` period (plus a second of slack) elapse.
	time.Sleep((5 + 1) * time.Second)

	// A missed watchdog notification would have made systemd restart Poseidon by now.
	showRestarts := exec.Command("/usr/bin/systemctl", "--user", "show", "poseidon.service", "-p", "NRestarts")
	s.Require().NoError(showRestarts.Err)
	output, err := showRestarts.Output()
	s.Require().NoError(err)

	// The command prints `NRestarts=<n>` followed by a newline; keep only the number.
	value := strings.ReplaceAll(string(output), "NRestarts=", "")
	value = strings.Trim(value, "\n")
	restarts, err := strconv.Atoi(value)
	s.Require().NoError(err)

	// Any restart beyond the intentional ones means the watchdog was not notified in time.
	s.Equal(PoseidonRestartCount, restarts)
}

View File

@ -46,6 +46,8 @@ func waitForPoseidon() {
} }
} }
var PoseidonRestartCount = 0
func killPoseidon() { func killPoseidon() {
processes, err := process.Processes() processes, err := process.Processes()
if err != nil { if err != nil {
@ -62,6 +64,7 @@ func killPoseidon() {
log.WithError(err).Error("Error killing Poseidon") log.WithError(err).Error("Error killing Poseidon")
} else { } else {
log.Info("Killed Poseidon") log.Info("Killed Poseidon")
PoseidonRestartCount++
} }
} }
} }